In [None]:
# Software Name : DistFactAssessLM
# SPDX-FileCopyrightText: Copyright (c) 2025 Orange SA
# SPDX-License-Identifier: GPL-2.0-or-later

# This software is distributed under the GNU General Public License v2.0 or later,
# see the "LICENSE.txt" file for more details or GNU General Public License v2.0 or later

# Authors: Hichem Ammar Khodja
# Software description: A factual knowledge assessment method for large language models using distractors

# Verbalization errors taxonomy

In this notebook, we analyze which template-based verbalization errors are the most common. The errors were annotated by one of the authors of the paper (Hichem Ammar Khodja).

In this notebook:

1. We provide the distribution of template-based verbalization errors.
2. We show that these errors are very common: about half of the template-based verbalizations contain one or more errors.

The meaning of each error can be found in the paper's appendix.

In [1]:
import pandas as pd
import os.path as osp
import os
from statsmodels.stats.proportion import proportion_confint

In [2]:
# Annotation context to analyze: 'present' or 'past' keeps only the rows whose
# `context` column equals that value; 'both' keeps every row.
CONTEXT = 'present' # accepted CONTEXT = ['present', 'past', 'both']

In [None]:
# Fail fast on an unsupported CONTEXT: a typo would otherwise filter out every row
# below and silently produce empty tables.
if CONTEXT not in ('present', 'past', 'both'):
    raise ValueError("CONTEXT must be one of 'present', 'past' or 'both', got %r" % (CONTEXT,))

# Annotation sheet, read from the current working directory (first column is the index).
annotations = pd.read_csv(osp.join(os.getcwd(), 'taxonomy_hichem.csv'), index_col=0)

# Keep only rows that carry an annotation and were not marked 'ignore'.
# Missing values have no string length, so they fail the `> 0` test and are dropped too.
has_label = annotations['errors'].str.len() > 0
not_ignored = annotations['errors'] != 'ignore'

# .copy() makes the filtered frame independent of `annotations`, so the column
# assignment below cannot trigger a SettingWithCopyWarning.
df = annotations[has_label & not_ignored].copy()

# '-' is mapped to the single label 'perfect'; any other value is a comma-separated
# list of labels (spaces are removed before splitting).
df['errors'] = df['errors'].apply(lambda x : ['perfect'] if x == '-' else x.replace(' ', '').split(','))

if CONTEXT != 'both':
    df = df[df['context'] == CONTEXT]

In [6]:
# One entry per (verbalization, label) pair. The index is reset first so that each
# verbalization is identified by its position, whatever the index read from the CSV.
labels = df['errors'].reset_index(drop=True).explode()

# One-hot encode every label occurrence; the positional index is repeated for
# verbalizations that carry several labels.
dummies = pd.get_dummies(labels)

# Collapse back to one row per verbalization. clip(upper=1) turns the counts into
# presence flags, so a label written twice in the same annotation is counted once
# and every column stays a valid 0/1 indicator for the proportions computed later.
result = dummies.groupby(level=0).sum().clip(upper=1).reset_index(drop=True)

# Labels that always get a column, even when they never occur in the data.
# Any other label only gets a column when it appears at least once.
metrics = ("adj_noun_conjug", "obj_deter", "perfect", "sub_deter", "wrong_language", "out_of_subject")

for m in metrics:
    if m not in result.columns:
        result[m] = 0
result.head()

Unnamed: 0,adj_noun_conjug,better_verb,obj_deter,obj_unclear,perfect,sub_deter,sub_unclear,wrong_conjug,wrong_language,out_of_subject
0,0,0,0,0,0,1,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0


In [7]:
import math
def confidence_interval(frame : pd.Series):
    """Format the Wilson score interval of a 0/1 indicator column, in percent.

    Meant to be used through ``DataFrame.agg``, which calls it once per column.

    Parameters
    ----------
    frame : pd.Series
        Indicator values (1 = present, 0 = absent). Missing values are ignored
        both in the success count and in the number of observations.

    Returns
    -------
    str
        ``'(low, high)'`` with both bounds in percent, one decimal place.
        ``'(nan, nan)'`` when the column has no non-missing observation.
    """
    # count() excludes missing values, consistent with sum() (and with mean()).
    n_obs = frame.count()
    if n_obs == 0:
        return '(nan, nan)'

    successes = frame.sum()
    low, high = proportion_confint(successes, n_obs, alpha=0.05, method='wilson')

    # Floating-point round-off can push a bound marginally outside [0, 1]
    # (e.g. a lower bound of -1e-18 when there are no successes), which would be
    # printed as "-0.0". Clamp to the valid range of a proportion.
    low = max(0.0, float(low))
    high = min(1.0, float(high))

    return '(%.1f, %.1f)' % (low*100, high*100)

In [9]:
# Per-label summary table: share of verbalizations carrying the label (in percent,
# one decimal) and the matching Wilson interval, ordered by label name.
intervals = result.agg(confidence_interval)
percentages = ["%.1f" % pct for pct in result.mean()*100]

result2 = pd.DataFrame({
    'Error type': intervals.index.tolist(),
    'Proportion': percentages,
    "Wilson's Conf. int.": intervals.tolist(),
}).sort_values('Error type')
result2

Unnamed: 0,Error type,Proportion,Wilson's Conf. int.
0,adj_noun_conjug,1.3,"(0.2, 7.0)"
1,better_verb,3.9,"(1.3, 10.8)"
2,obj_deter,13.0,"(7.2, 22.3)"
3,obj_unclear,5.2,"(2.0, 12.6)"
9,out_of_subject,0.0,"(-0.0, 4.8)"
4,perfect,54.5,"(43.5, 65.2)"
5,sub_deter,14.3,"(8.2, 23.8)"
6,sub_unclear,5.2,"(2.0, 12.6)"
7,wrong_conjug,14.3,"(8.2, 23.8)"
8,wrong_language,0.0,"(-0.0, 4.8)"


In [10]:
# Print the per-label table as LaTeX source, without the row index.
# escape=True escapes LaTeX special characters, e.g. the underscores in label names.
print(result2.to_latex(index=False, escape=True))

\begin{tabular}{lll}
\toprule
Error type & Proportion & Wilson's Conf. int. \\
\midrule
adj\_noun\_conjug & 1.3 & (0.2, 7.0) \\
better\_verb & 3.9 & (1.3, 10.8) \\
obj\_deter & 13.0 & (7.2, 22.3) \\
obj\_unclear & 5.2 & (2.0, 12.6) \\
out\_of\_subject & 0.0 & (-0.0, 4.8) \\
perfect & 54.5 & (43.5, 65.2) \\
sub\_deter & 14.3 & (8.2, 23.8) \\
sub\_unclear & 5.2 & (2.0, 12.6) \\
wrong\_conjug & 14.3 & (8.2, 23.8) \\
wrong\_language & 0.0 & (-0.0, 4.8) \\
\bottomrule
\end{tabular}



In [11]:
# Grouping of the individual labels into coarse categories.
perfect = ['perfect']
minor = ['obj_deter', 'sub_deter', 'obj_unclear', 'sub_unclear', 'adj_noun_conjug', 'better_verb', 'wrong_conjug']

blunders = ['out_of_subject', 'wrong_language']


def _has_any(labels):
    """Boolean Series: True for the rows of `result` carrying at least one of `labels`."""
    # reindex instead of result[labels]: a label that never occurs in the data has no
    # column in `result`; it is treated as an all-zero column rather than a KeyError.
    return result.reindex(columns=labels, fill_value=0).sum(axis=1).astype(bool)


has_blunder = _has_any(blunders)

df2 = pd.DataFrame({
    'perfect' : _has_any(perfect),
    # "only minor" means minor errors and no blunder: a verbalization with both
    # belongs to the blunder category only.
    'only minor' : _has_any(minor) & ~has_blunder,
    '\\geq 1 blunder' : has_blunder
})
df2 = df2.astype(int)
df2_ = df2.agg(confidence_interval).to_frame().reset_index()
df2_.columns = ['Error type', "Wilson's CI"]
df2_.insert(1, 'Proportion', (df2.mean()*100).apply(lambda x : "%.1f" % x).tolist())

df2_

Unnamed: 0,Error type,Proportion,Wilson's CI
0,perfect,54.5,"(43.5, 65.2)"
1,only minor,45.5,"(34.8, 56.5)"
2,\geq 1 blunder,0.0,"(-0.0, 4.8)"


In [8]:
# Print the category table as LaTeX source, without the row index.
# escape=False is set explicitly: the '\geq 1 blunder' label is raw LaTeX and must not
# be escaped, and the default for `escape` differs between pandas versions.
print(df2_.to_latex(index=False, escape=False))

\begin{tabular}{lll}
\toprule
Error type & Proportion & Wilson's CI \\
\midrule
perfect & 60.5 & (52.6, 67.9) \\
only minor & 39.5 & (32.1, 47.4) \\
\geq 1 blunder & 0.0 & (0.0, 2.5) \\
\bottomrule
\end{tabular}

