In [14]:
import numpy as np
import pandas as pd
import glob
import re
from tqdm import tqdm
import os.path
import collections
from collections import defaultdict
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.express as px
from datetime import datetime


In [17]:
# Run date as a compact YYYYMMDD stamp (local time), e.g. for tagging output files.
date = datetime.now().strftime('%Y%m%d')

In [8]:

# Directory that holds the original ground-truth / ChatGPT tables.
folder_prompt_ = "../../../data/OpenAI/PromptsAnalysesData/OriginalData/"

# Both CSVs are read the same way: the first two rows form a two-level column
# header and the first column becomes the row index.
read_options = dict(header=[0, 1], index_col=0)

# Ground-truth table.
file = "caribbean_df_orig_GT.csv"
df_caribbean_GT = pd.read_csv(folder_prompt_ + file, **read_options)
df_caribbean_GT = df_caribbean_GT.rename_axis('Species', axis='index')

# ChatGPT prediction table.
file = "caribbean_df_orig_ChatGPT.csv"
df_caribbean_ChatGPT = pd.read_csv(folder_prompt_ + file, **read_options)
df_caribbean_ChatGPT = df_caribbean_ChatGPT.rename_axis('Species', axis='index')

In [9]:
# Cell-wise difference between ground truth and prediction. A cell is NaN when
# either table has no value there (or when a row/column label is missing from
# one of the tables, because `subtract` aligns on labels).
prediction_error = df_caribbean_GT.subtract(df_caribbean_ChatGPT)

# Label every cell in a single vectorised pass. All masks are computed on the
# untouched numeric difference, so the order of the `.mask` calls is irrelevant:
#   NaN     -> 'Fail'       (no comparable value)
#   0       -> 'Correct'    (prediction equals ground truth)
#   +1 / -1 -> 'Incorrect'  (prediction differs by one)
# Any other numeric difference is left unchanged, as before.
# The cast to object lets string labels and numbers coexist in one column.
df = (
    prediction_error.astype(object)
    .mask(prediction_error.isna(), "Fail")
    .mask(prediction_error == 0, "Correct")
    .mask(prediction_error.abs() == 1, "Incorrect")
)

In [10]:
# Persist the labelled outcome table so it can be plotted later without
# recomputing it.
folder_plotting = "../../../data/plotting/"
file_name = "df_caribbean_textdata.csv"
csv_path = folder_plotting + file_name
df.to_csv(csv_path)

### Metric Scores

#### F1

In [26]:
# Flatten both tables into 1-D label vectors.
# NOTE(review): this pairs cells by position, so it assumes both tables have
# the same shape and row/column ordering -- confirm.
gt_array = df_caribbean_GT.to_numpy().flatten()
pred_array = df_caribbean_ChatGPT.to_numpy().flatten()

# Missing predictions are kept and encoded as their own class (-1) instead of
# being dropped.
pred_array = np.where(np.isnan(pred_array), -1, pred_array)

# Alternatives that were tried and left disabled:
#   * dropping the NaN positions from both gt_array and pred_array
#   * imputing NaNs with SimpleImputer(strategy='most_frequent')

# Support-weighted average of the per-class F1 scores.
f1 = f1_score(y_true=gt_array, y_pred=pred_array, average='weighted')
print(f1)

0.8205140726779405


### Plotting

#### Bar Chart

In [None]:
# Outcome labels, in the column order used for x_data.
outcome_labels = ('Correct', 'Incorrect', 'Fail')

# Top-level column names, in order of first appearance.
trait_names = list(df.columns.get_level_values(0).unique())

# For every trait: percentage of its cells (all sub-columns pooled) that carry
# each outcome label, rounded to two decimals.
outcome_shares = []
for trait in trait_names:
    cells = df[trait].values
    outcome_shares.append([
        round((cells == outcome).sum() / cells.size * 100, 2)
        for outcome in outcome_labels
    ])

x_data = np.array(outcome_shares)
y_data = np.array(trait_names)

# Order the traits by ascending 'Correct' percentage (first column).
sorted_indices = np.argsort(x_data[:, 0])
x_data, y_data = x_data[sorted_indices], y_data[sorted_indices]

In [None]:
# Display label -> colour for each outcome category, in the order the
# categories are indexed when the chart is built.
_outcome_palette = {
    'Correct': '#2ca02c',
    'Incorrect': '#d62728',
    'No Information': '#1f77b4',
}

top_labels = list(_outcome_palette.keys())
colors = list(_outcome_palette.values())



fig = go.Figure()

# One horizontal bar trace per outcome category (column i of x_data), spanning
# all traits at once. With barmode='stack' this draws the same stacked bars as
# adding one trace per individual segment, but keeps the figure at a handful of
# traces and gives each trace a meaningful name for the hover label.
for i, (label, color) in enumerate(zip(top_labels, colors)):
    fig.add_trace(go.Bar(
        x=x_data[:, i], y=y_data,
        name=label,
        orientation='h',
        marker=dict(
            color=color,
            # thin light outline separates adjacent segments
            line=dict(color='rgb(248, 248, 249)', width=1)
        )
    ))

# Strip all default axis decoration; axis and percentage labels are supplied
# as annotations instead.
fig.update_layout(
    xaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
        # leave the left 15% of the paper free for the trait names
        domain=[0.15, 1]
    ),
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
    ),
    barmode='stack',
    paper_bgcolor='rgb(248, 248, 255)',
    plot_bgcolor='rgb(248, 248, 255)',
    margin=dict(l=120, r=10, t=140, b=80),
    showlegend=False,
)

# Percentage labels below this value are left blank (the annotation is still
# created, with empty text).
MIN_LABEL_PERCENT = 5


def _percent_label(value):
    """Return '<value>%', or '' when value is below MIN_LABEL_PERCENT."""
    return "" if value < MIN_LABEL_PERCENT else str(value) + '%'


annotations = []

for yd, xd in zip(y_data, x_data):

    # Trait name, right-aligned just left of the plotting area (y-axis label).
    annotations.append(dict(xref='paper', yref='y',
                            x=0.14, y=yd,
                            xanchor='right',
                            text=str(yd),
                            font=dict(family='Arial', size=14,
                                      color='rgb(67, 67, 67)'),
                            showarrow=False, align='right'))

    # Percentage label centred on each stacked segment: the running offset is
    # the left edge of the current segment.
    # The threshold is applied here, to percentage labels only, so a trait
    # name that happens to look like a small number is never blanked.
    offset = 0
    for share in xd:
        annotations.append(dict(xref='x', yref='y',
                                x=offset + share / 2, y=yd,
                                text=_percent_label(share),
                                font=dict(family='Arial', size=14,
                                          color='rgb(248, 248, 255)'),
                                showarrow=False))
        offset += share

# Category headings above the chart, one per outcome, in the matching colour.
for idx, label in enumerate(top_labels):
    annotations.append(dict(xref='paper', yref='paper',
                            x=idx/3 + 0.225, y=1.081,
                            # closing tag fixed: was "<b>...<b>"
                            text=F"<b>{label}</b>",
                            font=dict(family='Arial', size=18,
                                      color=colors[idx]),
                            showarrow=False))
    
# Attach the annotations and set the final size and title.
# The title's bold tag is now closed ("</b>") so that only the first line is
# bold and the "Text Data" subtitle is rendered in normal weight.
fig.update_layout(
    annotations=annotations,
    width=1100, height=800,
    bargap=0.1,
    title_text='<b>ChatGPT Prediction Caribbean Dataset</b><br>Text Data',
    title_x=0.6, title_y=0.95,
)


# fig.update_yaxes(automargin=True)

fig.show()

# Export the chart as an interactive HTML page and as a static PDF.
# NOTE(review): write_image needs a static-image export backend installed
# (e.g. kaleido) -- confirm it is available in this environment.
folder_figures = "../../../reports/figures/"
file_name = "PxPlot_HTML_BarchartPredictions_TextData.html"
fig.write_html(F"{folder_figures}{file_name}")
file_name = "PxPlot_PDF_BarchartPredictions_TextData.pdf"
fig.write_image(F"{folder_figures}{file_name}")

In [None]:
# Create custom legend using annotations
# legend_items = [
#     {'name': 'Correct', 'color': '#2ca02c', 'ypos': 1.09},
#     {'name': 'Incorrect', 'color': '#d62728', 'ypos': 1.06},
#     {'name': 'No Information', 'color': '#1f77b4', 'ypos': 1.03},
# ]

# for item in legend_items:
#     fig.add_shape(
#         type='rect',
#         xref='paper', x0=0.05, x1=0.1,
#         yref='paper', y0=item['ypos'], y1=item['ypos'] + 0.02,
#         fillcolor=item['color'],
#         line=dict(color=item['color'], width=1)
#     )
#     fig.add_annotation(
#         xref='paper', x=0.11, yref='paper', y=item['ypos'] + 0.02,
#         text=item['name'],
#         showarrow=False,
#         font=dict(size=10)
#     )

#### HeatMap

In [27]:
# Data
gt_array = df_caribbean_GT.values
pred_array = df_caribbean_ChatGPT.values

# Work in float explicitly: with two integer tables the difference would be an
# integer array, the 0.5 code below would be truncated to 0, and "incorrect"
# cells would then be recoded as "correct".
prediction_error = gt_array.astype(float) - pred_array.astype(float)

# Encode each cell for the heatmap:
#   1.0 -> correct (difference 0)
#   0.5 -> incorrect (difference +1 / -1)
#   0.0 -> no data (NaN)
# Any other difference is left as is. All masks are taken from the untouched
# difference, so the assignments do not depend on their order.
heat = prediction_error.copy()
heat[np.abs(prediction_error) == 1] = 0.5
heat[prediction_error == 0] = 1.0
heat[np.isnan(prediction_error)] = 0.0

# Color Scale
# Discrete three-band scale: each colour is listed at both edges of its band,
# so the colour is constant inside a band and switches abruptly at 0.33 / 0.66.
_band_edges = [0.00, 0.33, 0.66, 1.00]
_band_colors = ["#1f77b4", "#d62728", "#2ca02c"]

color_scale = [
    (edge, band_color)
    for band_color, lower, upper in zip(_band_colors, _band_edges[:-1], _band_edges[1:])
    for edge in (lower, upper)
]

# Labels
y_labels = [species for species in df_caribbean_ChatGPT.index]

# x ticks
# Top-level column name of every column, in column order.
trait_per_column = [trait for trait, _ in df_caribbean_ChatGPT.columns]
columns = np.array(trait_per_column)

# Trait -> list of the column positions that belong to it.
text_pos_dict = collections.defaultdict(list)
for position, trait in enumerate(trait_per_column):
    text_pos_dict[trait].append(position)

# One tick per trait, placed at the mean of its column positions, and one
# vertical separator half a cell after the trait's last column.
x_ticktext = list(text_pos_dict.keys())
x_tickvals = [sum(positions) / len(positions) for positions in text_pos_dict.values()]
v_lines = [positions[-1] + 0.5 for positions in text_pos_dict.values()]

fig = go.Figure()

# Heatmap of the encoded outcomes (0 = no data, 0.5 = incorrect, 1 = correct).
# zmin/zmax pin the colour range to [0, 1]; otherwise the range is taken from
# the data, and whenever one outcome class is absent the colours and the fixed
# colorbar ticks below would no longer line up with the codes.
fig.add_trace(go.Heatmap(
    z=heat,
    zmin=0, zmax=1,
    colorscale=color_scale,
    legendwidth=10,
    colorbar=dict(
        title="Prediction",
        # tick at the centre of each third of the colour range
        tickvals=[0.33/2, 0.5, 0.33/2*5],
        ticktext=["No Data Found", "Incorrect", "Correct"],
        ticks="outside",
        lenmode="pixels", len=100,),
    ))

fig.update_xaxes(tickangle=-25, tickfont=dict(size=10),
                 ticks="outside", tickwidth=2, ticklen=10,
                 title_text='Trait')
fig.update_yaxes(title_text='Species',
                 showticklabels=False)

# Vertical separators between the column groups of consecutive traits.
for v_line in v_lines:
    fig.add_vline(x=v_line, line_width=2, line_color="black")

fig.update_layout(
    # Bold tag is now closed ("</b>") so only the first title line is bold.
    title_text=F'<b>ChatGPT Prediction Caribbean Dataset</b><br>Text Data, F1-score: {round(f1, 2)}',
    title_x=0.5, title_y=0.90,

    # One tick per trait, at the positions given by x_tickvals.
    xaxis=dict(
        tickmode='array',
        tickvals=x_tickvals,
        ticktext=x_ticktext,
    ),

    width=1200, height=400,
)

fig.show()

# folder_figures = "../../../reports/figures/"
# file_name = F"PxPlot_HTML_HeatmapPredictions_TextData_{date}.html"
# fig.write_html(F"{folder_figures}{file_name}")
# file_name = F"PxPlot_PDF_HeatmapPredictions_TextData_{date}.pdf"
# fig.write_image(F"{folder_figures}{file_name}")