In [1]:
import numpy as np
import pandas as pd
import glob
import re
from tqdm import tqdm
import os.path
import collections
from collections import defaultdict
from sklearn.metrics import roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.express as px


In [2]:

# Directory that holds the two CSV tables compared in this notebook.
folder_prompt_ = "../../../data/OpenAI/PromptsAnalysesData/"


def _read_trait_table(csv_name):
    """Read one CSV from ``folder_prompt_``.

    The first two rows are used as a two-level column header, the first
    column becomes the index, and the index is named 'Species'.
    """
    table = pd.read_csv(F"{folder_prompt_}{csv_name}", header=[0, 1], index_col=0)
    return table.rename_axis('Species', axis='index')


# "GT" table (presumably the ground truth -- TODO confirm against the data source).
file = "caribbean_df_orig_GT.csv"
df_caribbean_GT = _read_trait_table(file)

# Table with the ChatGPT answers, same layout as the GT table.
file = "caribbean_df_orig_ChatGPT.csv"
df_caribbean_ChatGPT = _read_trait_table(file)

In [3]:
# df_caribbean_GT = df_caribbean_GT.droplevel(0, axis=1) 
# df_caribbean_ChatGPT = df_caribbean_ChatGPT.droplevel(0, axis=1) 

# df_caribbean_GT.columns = df_caribbean_GT.columns.get_level_values(0)
# df_caribbean_ChatGPT.columns = df_caribbean_ChatGPT.columns.get_level_values(0)

# df_caribbean_GT.columns = [' '.join(col) for col in df_caribbean_GT.columns.values]
# df_caribbean_ChatGPT.columns = [' '.join(col) for col in df_caribbean_ChatGPT.columns.values]

In [4]:
# df_caribbean_ChatGPT.compare(df_caribbean_GT)
# df_caribbean_ChatGPT.compare(df_caribbean_GT,
#                         # keep_shape=True,
#                         keep_equal=False,
#                         )

# Element-wise difference between the GT table and the ChatGPT table.
# Cells that are missing in either table (or not present in both) come out as NaN.
diff = df_caribbean_GT.subtract(df_caribbean_ChatGPT)

# Turn the numeric difference into outcome labels with vectorised masks
# (replaces three element-wise `applymap` passes; `DataFrame.applymap` is
# deprecated since pandas 2.1):
#   NaN      -> "Fail"       (no comparable value)
#   0        -> "Correct"    (both tables agree)
#   -1 or +1 -> "Incorrect"  (the tables disagree)
# Any other numeric difference is left as-is, exactly like before.
df = (
    diff.astype(object)  # object dtype so strings and leftover numbers can coexist
    .mask(diff.isna(), "Fail")
    .mask(diff.eq(0), "Correct")
    .mask(diff.abs().eq(1), "Incorrect")
)

In [5]:
# Show the labelled comparison table (one row per species, one column per trait value).
df

Unnamed: 0_level_0,Life form,Life form,Leaf position,Leaf position,Leaf position,Leaf position,Leaf position,Leaf composition,Leaf composition,Leaf composition,...,Aril colour,Aril colour,Aril colour,Aril colour,Seed colour,Seed colour,Seed colour,Seed colour,Seed colour,Seed colour
Unnamed: 0_level_1,liana,tree,alternate,"alternate, opposite",opposite,"opposite, whorls of 3","opposite, whorls of 3, alternate",3 palmate,3-5 palmate,3-5 pinnate,...,orange,red,white,yellow-geen,black,brown,green,grey,white,whitish
Species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Bursera karsteniana,Correct,Correct,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail,...,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail
Metopium brownei,Correct,Incorrect,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail,...,Fail,Fail,Fail,Fail,Correct,Incorrect,Correct,Correct,Correct,Correct
Handroanthus billbergii,Correct,Correct,Fail,Fail,Fail,Fail,Fail,Correct,Correct,Incorrect,...,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail
Avicennia germinans,Correct,Correct,Fail,Fail,Fail,Fail,Fail,Correct,Correct,Correct,...,Fail,Fail,Fail,Fail,Incorrect,Incorrect,Incorrect,Correct,Incorrect,Incorrect
Bourreria succulenta,Correct,Correct,Incorrect,Correct,Correct,Correct,Correct,Fail,Fail,Fail,...,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail


### Plotting

In [6]:
# Per-column labels for the plot: trait name (column level 0), capitalised
# trait value (column level 1) and a running column number.
group_labels = [trait for (trait, _) in df.columns]
sub_labels = [sublabel.capitalize() for (_, sublabel) in df.columns]
number_labels = list(range(len(df.columns)))

# Denominator for the percentages: the number of rows of the table that is
# actually being counted. (Previously the row count of df_caribbean_ChatGPT
# was used; that only matches when both source tables share the same index,
# otherwise the three percentages of a column do not add up to 100.)
total = len(df.index)

# Share of species per column for each outcome label, in percent.
# `eq(...)` is an exact, vectorised match; unlike `.str.count` it also works
# on columns that are not of object dtype and needs no Python-level loop.
correct_lst = df.eq("Correct").sum().to_numpy() / total * 100
incorrect_lst = df.eq("Incorrect").sum().to_numpy() / total * 100
fail_lst = df.eq("Fail").sum().to_numpy() / total * 100



In [7]:
# Bar chart with one bar stack per table column: share of species that were
# labelled Correct / Incorrect / Fail for that trait value.
fig = go.Figure()

# Legend entry, bar colour and data series for each outcome (kept in step).
predictions = [
    "Correct",
    "Incorrect",
    "No Information Found"
]

colors = [
    "green",
    "red",
    "blue"
]

lsts = [
    correct_lst,
    incorrect_lst,
    fail_lst
]

# Two-level category axis: the trait name groups the bars, the running
# column number identifies each bar inside its group.
x = [
    group_labels,
    number_labels
]

for (prediction, color, lst) in zip(predictions, colors, lsts):
    fig.add_bar(
        name=prediction,
        x=x,
        y=lst,
        marker_color=color,
        constraintext='outside',
        hovertext=sub_labels,  # trait value shown on hover
        textposition='auto'
        )

fig.update_layout(
    # uniformtext_minsize=8, uniformtext_mode='hide',
    title='ChatGPT Predictions',
    # xaxis_tickfont_size=6,
    yaxis=dict(
        # `titlefont` is a deprecated attribute that newer plotly releases
        # reject; the supported spelling is title.text / title.font.
        title=dict(
            text='Percentage',
            font=dict(size=16),
        ),
        # tickfont_size=3,
    ),

    barmode='relative',
    bargap=0.0, # gap between bars of adjacent location coordinates.
    bargroupgap=0.00, # gap between bars of the same location coordinate.

)

fig.show()


# Export the figure as interactive HTML and as a static PDF.
# NOTE: write_image needs plotly's static image export engine (e.g. kaleido)
# to be installed, and both calls expect the target folder to exist already.
folder_figures = "../../../reports/figures/"
file_name = "PxPlot_HTML_BarchartPredictions.html"
fig.write_html(F"{folder_figures}{file_name}")
file_name = "PxPlot_PDF_BarchartPredictions.pdf"
fig.write_image(F"{folder_figures}{file_name}")

In [10]:
# Stand-alone horizontal stacked bar chart built from the literal survey
# numbers below; it does not read the comparison table.

# Headings written above the chart, one per stacked segment.
top_labels = ['Strongly<br>agree', 'Agree', 'Neutral', 'Disagree',
              'Strongly<br>disagree']

# Segment colours, in the same order as `top_labels`.
colors = ['rgba(38, 24, 74, 0.8)', 'rgba(71, 58, 131, 0.8)',
          'rgba(122, 120, 168, 0.8)', 'rgba(164, 163, 204, 0.85)',
          'rgba(190, 192, 213, 1)']

# One row of percentages per statement in `y_data`.
x_data = [[21, 30, 21, 16, 12],
          [24, 31, 19, 15, 11],
          [27, 26, 23, 11, 13],
          [29, 24, 15, 18, 14]]

y_data = [
    'The course was effectively<br>organized',
    'The course developed my<br>abilities and skills for<br>the subject',
    'The course developed my<br>ability to think critically about<br>the subject',
    'I would recommend this<br>course to a friend',
]

fig = go.Figure()

# Add the traces segment by segment (all rows for segment 0, then all rows
# for segment 1, ...) so that barmode='stack' places them left to right.
for seg_idx in range(len(x_data[0])):
    seg_marker = dict(
        color=colors[seg_idx],
        line=dict(color='rgb(248, 248, 249)', width=1),
    )
    for row_values, statement in zip(x_data, y_data):
        fig.add_trace(go.Bar(
            x=[row_values[seg_idx]],
            y=[statement],
            orientation='h',
            marker=seg_marker,
        ))

# Both axes are drawn without grid, axis line, tick labels or zero line;
# the row names are added as annotations further down instead.
bare_axis = dict(
    showgrid=False,
    showline=False,
    showticklabels=False,
    zeroline=False,
)

fig.update_layout(
    xaxis=dict(bare_axis, domain=[0.15, 1]),
    yaxis=dict(bare_axis),
    barmode='stack',
    paper_bgcolor='rgb(248, 248, 255)',
    plot_bgcolor='rgb(248, 248, 255)',
    margin=dict(l=120, r=10, t=140, b=80),
    showlegend=False,
)

# Fonts shared by the annotations: dark for text on the page background,
# light for the percentages printed on the bars.
dark_font = dict(family='Arial', size=14, color='rgb(67, 67, 67)')
light_font = dict(family='Arial', size=14, color='rgb(248, 248, 255)')

annotations = []

for statement, row_values in zip(y_data, x_data):
    # The headings above the chart are positioned from the last row's segments.
    is_last_row = statement == y_data[-1]

    # Row name, right-aligned just left of the plotting area.
    annotations.append(dict(xref='paper', yref='y',
                            x=0.14, y=statement,
                            xanchor='right',
                            text=str(statement),
                            font=dark_font,
                            showarrow=False, align='right'))

    left_edge = 0  # running x position where the current segment starts
    for seg_idx, value in enumerate(row_values):
        centre = left_edge + value / 2

        # Percentage printed in the middle of the segment.
        annotations.append(dict(xref='x', yref='y',
                                x=centre, y=statement,
                                text=str(value) + '%',
                                font=light_font,
                                showarrow=False))

        # Heading above the chart, centred over this segment.
        if is_last_row:
            annotations.append(dict(xref='x', yref='paper',
                                    x=centre, y=1.1,
                                    text=top_labels[seg_idx],
                                    font=dark_font,
                                    showarrow=False))

        left_edge += value

fig.update_layout(annotations=annotations)

fig.show()

In [17]:
# Show the labelled comparison table again before aggregating it per trait.
df

Unnamed: 0_level_0,Life form,Life form,Leaf position,Leaf position,Leaf position,Leaf position,Leaf position,Leaf composition,Leaf composition,Leaf composition,...,Aril colour,Aril colour,Aril colour,Aril colour,Seed colour,Seed colour,Seed colour,Seed colour,Seed colour,Seed colour
Unnamed: 0_level_1,liana,tree,alternate,"alternate, opposite",opposite,"opposite, whorls of 3","opposite, whorls of 3, alternate",3 palmate,3-5 palmate,3-5 pinnate,...,orange,red,white,yellow-geen,black,brown,green,grey,white,whitish
Species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Bursera karsteniana,Correct,Correct,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail,...,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail
Metopium brownei,Correct,Incorrect,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail,...,Fail,Fail,Fail,Fail,Correct,Incorrect,Correct,Correct,Correct,Correct
Handroanthus billbergii,Correct,Correct,Fail,Fail,Fail,Fail,Fail,Correct,Correct,Incorrect,...,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail
Avicennia germinans,Correct,Correct,Fail,Fail,Fail,Fail,Fail,Correct,Correct,Correct,...,Fail,Fail,Fail,Fail,Incorrect,Incorrect,Incorrect,Correct,Incorrect,Incorrect
Bourreria succulenta,Correct,Correct,Incorrect,Correct,Correct,Correct,Correct,Fail,Fail,Fail,...,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail,Fail


In [58]:
# Distinct top-level column labels (the trait names), in order of first appearance.
df.columns.unique(level=0)

Index(['Life form', 'Leaf position', 'Leaf composition', 'Leaf shape',
       'Leaf margin', 'Leaf upper side', 'Leaf lower side', 'Leaf glands',
       'Leaf rachis', 'Thorns spines', 'Stipules', 'Inflorescence type',
       'Sepals calyx shape', 'Sepals calyx numer', 'Petals corolla shape',
       'Petals corolla number', 'Petals corolla colour', 'Stamen shape',
       'Stamen number', 'Fruit type', 'Fruit shape', 'Fruit colour',
       'Aril colour', 'Seed colour'],
      dtype='object')

In [74]:
# Aggregate the outcome labels per trait (top-level column label).
# x_data: one [correct, incorrect, fail] list of rounded percentages per trait.
# y_data: the trait names, in the same order.
x_data, y_data = [], []

for trait in df.columns.get_level_values(0).unique():
    # All cells that belong to this trait, across every trait value and species.
    cells = df[trait].values
    n_cells = cells.size

    shares = [
        round((cells == outcome).sum() / n_cells * 100, 0)
        for outcome in ('Correct', 'Incorrect', 'Fail')
    ]

    x_data.append(shares)
    y_data.append(trait)

In [88]:
# Horizontal stacked bar chart: one bar per trait, split into the share of
# Correct / Incorrect / Fail cells taken from x_data (rows follow y_data).

# Headings written above the chart, one per stacked segment.
top_labels = [
     'Correct',
     'Incorrect',
     'No Information']

# Segment colours, in the same order as `top_labels`.
colors = [
     'green',
     'red',
     'blue']

fig = go.Figure()

# Add the traces segment by segment (all traits for segment 0, then all
# traits for segment 1, ...) so that barmode='stack' places them left to right.
for i in range(len(x_data[0])):
    for xd, yd in zip(x_data, y_data):
        fig.add_trace(go.Bar(
            x=[xd[i]], y=[yd],
            orientation='h',
            marker=dict(
                color=colors[i],
                line=dict(color='rgb(248, 248, 249)', width=1)
            )
        ))

fig.update_layout(
    xaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
        domain=[0.15, 1]
    ),
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
    ),
    barmode='stack',
    paper_bgcolor='rgb(248, 248, 255)',
    plot_bgcolor='rgb(248, 248, 255)',
    margin=dict(l=120, r=10, t=140, b=80),
    showlegend=False,
)

annotations = []

for yd, xd in zip(y_data, x_data):
    # The headings above the chart are positioned from the last row's segments.
    is_last_row = yd == y_data[-1]

    # Trait name, right-aligned just left of the plotting area.
    annotations.append(dict(xref='paper', yref='y',
                            x=0.14, y=yd,
                            xanchor='right',
                            text=str(yd),
                            font=dict(family='Arial', size=14,
                                      color='rgb(67, 67, 67)'),
                            showarrow=False, align='right'))

    space = 0  # running x position where the current segment starts
    for i, value in enumerate(xd):
        centre = space + value / 2

        # Percentage printed in the middle of the segment.
        # - Formatted without decimals: the values are floats, so str() would
        #   render e.g. "67.0%".
        # - Skipped for empty segments: a label on a zero-width segment would
        #   be drawn on top of the neighbouring labels.
        if value > 0:
            annotations.append(dict(xref='x', yref='y',
                                    x=centre, y=yd,
                                    text=f"{value:.0f}%",
                                    font=dict(family='Arial', size=14,
                                              color='rgb(248, 248, 255)'),
                                    showarrow=False))

        # Heading above the chart, centred over this segment.
        if is_last_row:
            annotations.append(dict(xref='x', yref='paper',
                                    x=centre, y=1.1,
                                    text=top_labels[i],
                                    font=dict(family='Arial', size=14,
                                              color='rgb(67, 67, 67)'),
                                    showarrow=False))
        space += value

fig.update_layout(annotations=annotations, width=1300, height=800)
# Let plotly grow the left margin if the y axis needs more room.
fig.update_yaxes(automargin=True)

fig.show()