In [1]:
import numpy as np
import pandas as pd
import glob
import re
from tqdm import tqdm
import os.path
import collections
from collections import defaultdict
from sklearn.metrics import roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.express as px


In [2]:

# Directory containing the zero-shot prompt-analysis CSV files.
folder_prompt_ = "../../../data/OpenAI/PromptsAnalysesData/ZeroShot/"

# Both CSVs are read with the first two rows as a two-level column header
# and the first column as the row index, which is then named 'Species'.

# "GT" table (presumably the ground-truth annotations -- confirm with the data owner).
file = "caribbean_df_GT.csv"
df_caribbean_GT = (
    pd.read_csv(folder_prompt_ + file, header=[0, 1], index_col=0)
    .rename_axis('Species', axis='index')
)

# "ChatGPT" table (presumably the model's answers for the same layout -- confirm).
file = "caribbean_df_ChatGPT.csv"
df_caribbean_ChatGPT = (
    pd.read_csv(folder_prompt_ + file, header=[0, 1], index_col=0)
    .rename_axis('Species', axis='index')
)

In [3]:
# df_caribbean_GT = df_caribbean_GT.droplevel(0, axis=1) 
# df_caribbean_ChatGPT = df_caribbean_ChatGPT.droplevel(0, axis=1) 

# df_caribbean_GT.columns = df_caribbean_GT.columns.get_level_values(0)
# df_caribbean_ChatGPT.columns = df_caribbean_ChatGPT.columns.get_level_values(0)

# df_caribbean_GT.columns = [' '.join(col) for col in df_caribbean_GT.columns.values]
# df_caribbean_ChatGPT.columns = [' '.join(col) for col in df_caribbean_ChatGPT.columns.values]

In [4]:
# df_caribbean_ChatGPT.compare(df_caribbean_GT)
# df_caribbean_ChatGPT.compare(df_caribbean_GT,
#                         # keep_shape=True,
#                         keep_equal=False,
#                         )

def _label_difference(value):
    """Translate one cell of (GT - ChatGPT) into an outcome label.

    Returns
    -------
    'Fail'       when the difference is missing (NaN),
    'Correct'    when the difference is 0,
    'Incorrect'  when the difference is +1 or -1,
    otherwise the value itself, unchanged.
    """
    if pd.isna(value):
        return "Fail"
    if isinstance(value, (int, float, np.number)):
        if value == 0:
            return "Correct"
        if value == 1 or value == -1:
            return "Incorrect"
    return value


# Element-wise difference of the two tables. pandas aligns on row and column
# labels, so a cell is NaN when either input is NaN or the label exists in
# only one of the two frames.
difference = df_caribbean_GT.subtract(df_caribbean_ChatGPT)

# Classify every cell in a single pass. `DataFrame.applymap` is deprecated
# since pandas 2.1 in favour of `DataFrame.map`; use the new name when the
# installed pandas provides it and fall back to the old one otherwise.
if hasattr(pd.DataFrame, "map"):
    df = difference.map(_label_difference)
else:
    df = difference.applymap(_label_difference)

### Plotting

##### Old

In [5]:
# group_labels = [trait for (trait, _) in df.columns]
# sub_labels = [sublabel.capitalize() for (_, sublabel) in df.columns]
# number_labels = [number for number, (_, _) in enumerate(df.columns)]

# correct_lst = []
# incorrect_lst = []
# fail_lst = []
# for column in df.columns:

#     # Correct
#     value = df[column].str.count("Correct").sum() # Cannot sum directly
#     correct_lst.append(value)

#         # Correct
#     value = df[column].str.count("Incorrect").sum() # Cannot sum directly
#     incorrect_lst.append(value)

#         # Correct
#     value = df[column].str.count("Fail").sum() # Cannot sum directly
#     fail_lst.append(value)

# total = len(df_caribbean_ChatGPT.index)

# correct_lst = np.array(correct_lst)
# incorrect_lst = np.array(incorrect_lst)
# fail_lst = np.array(fail_lst)

# correct_lst = correct_lst / total * 100
# incorrect_lst = incorrect_lst / total * 100
# fail_lst = fail_lst / total * 100

# fig = go.Figure()

# predictions = [
#     "Correct",
#     "Incorrect",
#     "No Information Found"
# ]

# colors = [
#     "green",
#     "red",
#     "blue"
# ]

# lsts = [
#     correct_lst,
#     incorrect_lst,
#     fail_lst
# ]

# x = [
#     group_labels,
#     number_labels
# ]

# for (prediction, color, lst) in zip(predictions, colors, lsts):
#     fig.add_bar(
#         name=prediction,
#         x=x,
#         y=lst,
#         marker_color=color,
#         constraintext='outside',
#         hovertext=sub_labels,
#         textposition='auto'
#         )
    
# fig.update_layout(
#     # uniformtext_minsize=8, uniformtext_mode='hide',
#     title='ChatGPT Predictions',
#     # xaxis_tickfont_size=6,
#     yaxis=dict(
#         title='Percentage',
#         titlefont_size=16,
#         # tickfont_size=3,
#     ),

#     barmode='relative',
#     bargap=0.0, # gap between bars of adjacent location coordinates.
#     bargroupgap=0.00, # gap between bars of the same location coordinate.
    
# )

# fig.show()


# folder_figures = "../../../reports/figures/"
# file_name = "PxPlot_HTML_BarchartPredictions.html"
# fig.write_html(F"{folder_figures}{file_name}")
# file_name = "PxPlot_PDF_BarchartPredictions.pdf"
# fig.write_image(F"{folder_figures}{file_name}")

##### New

In [6]:
# For every top-level column label of `df`, compute the percentage of cells
# labelled Correct / Incorrect / Fail (in that order).
outcome_labels = ('Correct', 'Incorrect', 'Fail')

x_data, y_data = [], []
for trait in df.columns.get_level_values(0).unique():
    cells = df[trait].values  # every cell under this top-level label
    n_cells = cells.size

    x_data.append([
        round((cells == outcome).sum() / n_cells * 100, 2)
        for outcome in outcome_labels
    ])
    y_data.append(trait)

x_data, y_data = np.array(x_data), np.array(y_data)

# Reorder both arrays by ascending 'Correct' percentage (column 0 of x_data).
sorted_indices = np.argsort(x_data[:, 0])
x_data, y_data = x_data[sorted_indices], y_data[sorted_indices]

In [9]:
# Horizontal stacked bar chart: one bar per row of x_data / entry of y_data,
# split into the three outcome percentages stored in the columns of x_data.

# Display name and colour for each column of x_data, in column order.
top_labels = [
    'Correct',
    'Incorrect',
    'No Information',
]

colors = [
    '#2ca02c',
    '#d62728',
    '#1f77b4',
]

# Segments smaller than this many percentage points get no in-bar label.
MIN_LABEL_PERCENT = 5


def _percent_text(value):
    """Return the in-bar label ('<value>%') or '' when the segment is below MIN_LABEL_PERCENT."""
    return "" if value < MIN_LABEL_PERCENT else str(value) + '%'


fig = go.Figure()

# One trace per outcome; barmode='stack' places the segments end to end for
# each y category, in the order the traces are added.
for idx, (label, color) in enumerate(zip(top_labels, colors)):
    fig.add_trace(go.Bar(
        x=x_data[:, idx], y=y_data,
        name=label,
        orientation='h',
        marker=dict(
            color=color,
            line=dict(color='rgb(248, 248, 249)', width=1)
        )
    ))

fig.update_layout(
    xaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
        domain=[0.15, 1]  # leaves the left 15% of the paper for the y labels drawn below
    ),
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,  # tick labels are replaced by annotations
        zeroline=False,
    ),
    barmode='stack',
    paper_bgcolor='rgb(248, 248, 255)',
    plot_bgcolor='rgb(248, 248, 255)',
    margin=dict(l=120, r=10, t=140, b=80),
    showlegend=False,
)

annotations = []

for yd, xd in zip(y_data, x_data):

    # y-axis label, right-aligned just left of the plotting area
    annotations.append(dict(xref='paper', yref='y',
                            x=0.14, y=yd,
                            xanchor='right',
                            text=str(yd),
                            font=dict(family='Arial', size=14,
                                      color='rgb(67, 67, 67)'),
                            showarrow=False, align='right'))

    # percentage label centred in each segment; the visibility decision is
    # made here, from the numeric value, instead of re-parsing the text later
    left_edge = 0
    for value in xd:
        annotations.append(dict(xref='x', yref='y',
                                x=left_edge + value / 2, y=yd,
                                text=_percent_text(value),
                                font=dict(family='Arial', size=14,
                                          color='rgb(248, 248, 255)'),
                                showarrow=False))
        left_edge += value

# Column headings above the plot, coloured like their segments
for idx, label in enumerate(top_labels):
    annotations.append(dict(xref='paper', yref='paper',
                            x=idx/3 + 0.225, y=1.081,
                            text=F"<b>{label}</b>",
                            font=dict(family='Arial', size=18,
                                      color=colors[idx]),
                            showarrow=False))

fig.update_layout(annotations=annotations,
                  width=1100, height=800,
                  bargap=0.1,
                  title_text='<b>ChatGPT Prediction Caribbean Dataset</b><br>Zeroshot',
                  title_x=0.6, title_y=0.95)


# fig.update_yaxes(automargin=True)

fig.show()

# folder_figures = "../../../reports/figures/"
# file_name = "PxPlot_HTML_BarchartPredictions_ZeroShot.html"
# fig.write_html(F"{folder_figures}{file_name}")
# file_name = "PxPlot_PDF_BarchartPredictions_ZeroShot.pdf"
# fig.write_image(F"{folder_figures}{file_name}")

In [8]:
# Create custom legend using annotations
# legend_items = [
#     {'name': 'Correct', 'color': '#2ca02c', 'ypos': 1.09},
#     {'name': 'Incorrect', 'color': '#d62728', 'ypos': 1.06},
#     {'name': 'No Information', 'color': '#1f77b4', 'ypos': 1.03},
# ]

# for item in legend_items:
#     fig.add_shape(
#         type='rect',
#         xref='paper', x0=0.05, x1=0.1,
#         yref='paper', y0=item['ypos'], y1=item['ypos'] + 0.02,
#         fillcolor=item['color'],
#         line=dict(color=item['color'], width=1)
#     )
#     fig.add_annotation(
#         xref='paper', x=0.11, yref='paper', y=item['ypos'] + 0.02,
#         text=item['name'],
#         showarrow=False,
#         font=dict(size=10)
#     )