In [1]:
import pickle
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Load the pickled ticket evaluations from the current working directory.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this cell against a file from a trusted source.
with open("ticket_evaluations.pkl", "rb") as f:
    evals = pickle.load(f)

# Build a DataFrame directly from the loaded object.
# Assumes `evals` is a list of per-ticket dicts keyed by criterion name — TODO confirm.
df = pd.DataFrame(evals)

In [2]:
# 2) Derive the overall verdict: a response counts as appropriate only when
#    every individual criterion is True for that row.
criteria = [
    "contextual_relevance",
    "technical_accuracy",
    "professional_tone",
    "actionable_guidance",
]
# Row-wise logical AND across the criterion columns.
# NOTE(review): .all() skips missing values by default, so a row with a
# missing criterion may still evaluate to True — confirm the data has no NaNs.
df["response_appropriateness"] = df[criteria].all(axis="columns")

In [3]:
# 3) Tally the True/False outcomes for each criterion and for the combined flag.
counts = (
    df[[*criteria, "response_appropriateness"]]
    .apply(pd.Series.value_counts)  # one value_counts per column
    .T                              # criteria become rows, outcomes become columns
    .fillna(0)                      # an outcome that never occurred counts as 0
    .astype(int)
)

# Reshape to long format (Criterion, Value, Count) for the grouped bar chart.
counts_reset = (
    counts.reset_index()
    .melt(id_vars="index", var_name="Value", value_name="Count")
    .rename(columns={"index": "Criterion"})
)

In [4]:
# Grouped bar chart: one cluster per criterion, one bar per True/False value.
# update_layout returns the figure it was called on, so it can be chained here.
fig_counts = px.bar(
    data_frame=counts_reset,
    x="Criterion",
    y="Count",
    color="Value",
    barmode="group",
    title="True/False Counts by Criterion",
).update_layout(
    xaxis_title="Criterion",
    yaxis_title="Count",
)
fig_counts.show()

In [7]:
# Percentage of rows that are True for each criterion and for the combined flag.
# The mean is restricted to the criterion columns on purpose: averaging the
# whole frame would fail (or pick up unrelated columns) as soon as the
# evaluation records carry any non-boolean field.
percentages = (
    df[criteria + ["response_appropriateness"]]
    .mean()        # mean of booleans is the proportion of True values
    .mul(100)
    .round(2)
    .rename_axis("Criterion")
    .reset_index(name="Percentage True")
)

In [8]:
# Bare expression: renders the percentage table as this cell's output.
percentages

Unnamed: 0,Criterion,Percentage True
0,contextual_relevance,93.0
1,technical_accuracy,97.0
2,professional_tone,97.0
3,actionable_guidance,97.0
4,response_appropriateness,91.0
