In [382]:
import re
import pandas as pd
import plotly.express as px
import spacy
from copy import deepcopy

## Information on answers
Metadata for each answer: which form, question type, data item, answer type, and metric it corresponds to.

In [383]:
# Load the per-answer metadata table; the preview below shows its columns
# (form, type_question, data, type_answer, metric).
info_answers = pd.read_csv("info_answers.csv")
# Print (rows, columns) as a quick sanity check on the load.
print(info_answers.shape)
# Last expression of the cell: display the first three rows.
info_answers.head(3)

(144, 5)


Unnamed: 0,form,type_question,data,type_answer,metric
0,f1,summary,French_Revolution,triples_dbpedia,granularity
1,f1,summary,French_Revolution,triples_dbpedia,relevance
2,f1,summary,French_Revolution,triples_dbpedia,succinctness


## Get answers from forms
Metrics scored in the forms: granularity, relevance, diversity, succinctness.

In [384]:
def read_answers(data_path):
    """Load one answers CSV and reshape it to long format.

    The CSV is transposed so that every CSV column becomes a row. The first
    and the last CSV columns are discarded; the header of each remaining CSV
    column ends up in the ``answer`` column of the result.

    Parameters
    ----------
    data_path : str or path-like
        Location of the CSV file, handed to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with columns ``answer`` (CSV column header), ``n``
        (row label of the CSV, i.e. 0, 1, ... with the default index) and
        ``value`` (the cell content), one row per (answer, n) pair.
    """
    # After transposing, rows are the CSV columns; [:-1] drops the last one.
    by_column = pd.read_csv(data_path).transpose()[:-1]
    # Drop the first CSV column as well.
    # NOTE(review): presumably the form timestamp -- confirm against the export.
    by_column = by_column[1:]

    row_labels = list(by_column.columns)
    # Move the CSV headers out of the index into a regular column.
    by_column = by_column.reset_index()
    by_column.columns = ["answer"] + row_labels

    return by_column.melt(
        id_vars="answer", value_vars=row_labels, var_name="n", value_name="value"
    )

In [385]:
# Parse the answers CSV of each form into a long-format frame, keyed by form id.
answers = {
    form_id: read_answers(f"{form_id}_answers.csv")
    for form_id in ("f1", "f2")
}
answers["f1"].head(3)

Unnamed: 0,answer,n,value
0,"Answer:\n""""""\nThe French Revolution, a period ...",0,3
1,"Answer:\n""""""\nThe French Revolution, a period ...",0,4
2,"Answer:\n""""""\nThe French Revolution, a period ...",0,2


In [386]:
def build_data(answers, info_answers, columns):
    """Attach the metadata rows of each form to the answers of that form.

    For every form in ``answers``, the rows of ``info_answers`` whose ``form``
    equals the form id are placed side by side (column-wise, by position) with
    the answer rows of one ``n`` value at a time. All pieces are then stacked
    into a single frame.

    Parameters
    ----------
    answers : dict
        Maps a form id to a DataFrame that has an ``n`` column.
    info_answers : pandas.DataFrame
        Metadata table with a ``form`` column.
    columns : list
        Accepted for interface compatibility; not used by the function.

    Returns
    -------
    pandas.DataFrame
        The stacked pieces, renumbered with a fresh 0..N-1 index.
    """
    pieces = []
    for form_id, form_answers in answers.items():
        form_info = info_answers[info_answers.form == form_id].reset_index(drop=True)
        for n_value in form_answers.n.unique():
            rows_for_n = form_answers[form_answers.n == n_value].reset_index(drop=True)
            # Both frames carry a 0-based index, so axis=1 pairs them row by row.
            pieces.append(pd.concat([form_info, rows_for_n], axis=1))
    # ignore_index=True already renumbers the stacked rows 0..N-1.
    return pd.concat(pieces, axis=0, ignore_index=True)

In [387]:
# Combined column names of the metadata and answers frames; passed to
# build_data, which accepts the argument but does not use it.
columns = list(info_answers.columns) + list(answers["f1"].columns)
# Join the metadata and the answers of every form into one long table.
data = build_data(answers=answers, info_answers=info_answers, columns=columns)
# Last expression of the cell: display the resulting frame.
data

Unnamed: 0,form,type_question,data,type_answer,metric,answer,n,value
0,f1,summary,French_Revolution,triples_dbpedia,granularity,"Answer:\n""""""\nThe French Revolution, a period ...",0,3
1,f1,summary,French_Revolution,triples_dbpedia,relevance,"Answer:\n""""""\nThe French Revolution, a period ...",0,4
2,f1,summary,French_Revolution,triples_dbpedia,succinctness,"Answer:\n""""""\nThe French Revolution, a period ...",0,2
3,f1,summary,French_Revolution,triples_dbpedia,diversity,"Answer:\n""""""\nThe French Revolution, a period ...",0,4
4,f1,summary,French_Revolution,base,granularity,"Answer:\n""""""\nThe French Revolution, spanning ...",0,3
...,...,...,...,...,...,...,...,...
715,f2,actor_common,"(Guillaume_Brune, Magnus_Gustav_von_Essen)",triples_dbpedia,diversity,"Answer: \n""""""\nGuillaume Brune and Magnus Gust...",4,5
716,f2,actor_common,"(Guillaume_Brune, Magnus_Gustav_von_Essen)",triples_ng,granularity,"Answer: \n""""""\nGuillaume Brune and Magnus Gust...",4,4
717,f2,actor_common,"(Guillaume_Brune, Magnus_Gustav_von_Essen)",triples_ng,relevance,"Answer: \n""""""\nGuillaume Brune and Magnus Gust...",4,5
718,f2,actor_common,"(Guillaume_Brune, Magnus_Gustav_von_Essen)",triples_ng,succinctness,"Answer: \n""""""\nGuillaume Brune and Magnus Gust...",4,5


## Add groundedness

In [388]:
def clean_answer(text):
    """Strip the form wrapper from an answer string.

    Removes, in this order: a closing triple-double-quote followed by a
    bracketed tag (separated by a space, or by a newline and a space), and the
    opening ``Answer:`` header line (with zero, one or two trailing spaces)
    together with the triple-double-quote line that follows it.

    Parameters
    ----------
    text : str
        Raw answer text.

    Returns
    -------
    str
        ``text`` with every occurrence of the wrapper patterns removed.
    """
    # Applied one after the other; the order is kept as-is on purpose.
    wrapper_patterns = (
        r'""" \[.+\]',
        r'"""\n \[.+\]',
        r'Answer: \n"""\n',
        r'Answer:\n"""\n',
        r'Answer:  \n"""\n',
    )
    cleaned = text
    for pattern in wrapper_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

# Load the groundedness table, then derive two columns in one step:
#   answer_pp    - the answer text with the form wrapper stripped
#   groundedness - decimal commas replaced by dots, then parsed as float
grounding = pd.read_csv("grounding.csv")
grounding = grounding.assign(
    answer_pp=grounding["answer"].apply(clean_answer),
    groundedness=grounding["groundedness"].str.replace(',', '.').astype(float),
)
print(grounding.shape)
grounding.head(3)

(36, 6)


Unnamed: 0,answer,events,nb_events,nb_mentioned,groundedness,answer_pp
0,"Answer: \n""""""\nBetween January 1, 1792, and Ja...",http://dbpedia.org/resource/Battle_of_Jemappes...,11,2,0.909091,"Between January 1, 1792, and January 1, 1793, ..."
1,"Answer: \n""""""\nBetween January 1, 1792, and Ja...",http://dbpedia.org/resource/Battle_of_Jemappes...,11,8,3.636364,"Between January 1, 1792, and January 1, 1793, ..."
2,"Answer: \n""""""\nBetween January 1, 1792, and Ja...",http://dbpedia.org/resource/Battle_of_Jemappes...,11,2,0.909091,"Between January 1, 1792, and January 1, 1793, ..."


In [389]:
# Independent copy of the identifying columns of `data`, extended with the
# cleaned answer text, de-duplicated and ordered by that cleaned text.
data_grounded = (
    data[["form", "type_question", "data", "type_answer", "answer"]]
    .copy()
    .assign(answer_pp=lambda frame: frame["answer"].apply(clean_answer))
    .drop_duplicates()
    .sort_values(by="answer_pp")
)
print(data_grounded.shape)
data_grounded.head(3)

(144, 6)


Unnamed: 0,form,type_question,data,type_answer,answer,answer_pp
48,f1,actor_event,Antoine_Balland,base,"Answer:\n""""""\nAntoine Balland, a French priest...","Antoine Balland, a French priest, was executed..."
49,f1,actor_event,Antoine_Balland,base,"Answer:\n""""""\nAntoine Balland, a French priest...","Antoine Balland, a French priest, was executed..."
50,f1,actor_event,Antoine_Balland,base,"Answer:\n""""""\nAntoine Balland, a French priest...","Antoine Balland, a French priest, was executed..."


In [390]:
# Look up the groundedness score of every answer through its cleaned text and
# expose it under the same `value` column the other metrics use.
data_grounded = pd.merge(
    data_grounded,
    grounding[["answer_pp", "groundedness"]],
    on="answer_pp",
    how="left",
).rename(columns={"groundedness": "value"})
data_grounded["metric"] = "groundedness"
# NOTE(review): -1 is presumably a sentinel meaning "not tied to one form
# row label n" -- confirm.
data_grounded["n"] = -1
# Rebuild a uniform answer label. The suffix is a plain "[Groundedness]": the
# previous literal '\[' was an invalid escape sequence in a non-raw string and
# left a stray backslash in every label, which the '""" \[.+\]' pattern in
# clean_answer does not match.
data_grounded["answer"] = 'Answer:\n"""\n' + data_grounded["answer_pp"] + '""" [Groundedness]'
# Align with the column layout of `data`; rows that only differed in their
# original answer text collapse here.
data_grounded = data_grounded[data.columns].drop_duplicates()
print(data_grounded.shape)
data_grounded.head(3)

(36, 8)


Unnamed: 0,form,type_question,data,type_answer,metric,answer,n,value
0,f1,actor_event,Antoine_Balland,base,groundedness,"Answer:\n""""""\nAntoine Balland, a French priest...",-1,0.0
4,f2,cause_consequence,Battle_of_Winterthur,triples_dbpedia,groundedness,"Answer:\n""""""\nAt the end of the Battle of Wint...",-1,5.0
8,f2,cause_consequence,Battle_of_Winterthur,triples_ng,groundedness,"Answer:\n""""""\nAt the end of the Battle of Wint...",-1,5.0


In [391]:
# Append the groundedness rows to the other metric rows. Rows already tagged
# "groundedness" are filtered out first, so re-running this cell replaces them
# instead of appending a second copy. ignore_index=True renumbers the rows.
data = pd.concat(
    [data[data["metric"] != "groundedness"], data_grounded],
    axis=0,
    ignore_index=True,
)
print(data.shape)
data.head(3)

(756, 8)


Unnamed: 0,form,type_question,data,type_answer,metric,answer,n,value
0,f1,summary,French_Revolution,triples_dbpedia,granularity,"Answer:\n""""""\nThe French Revolution, a period ...",0,3
1,f1,summary,French_Revolution,triples_dbpedia,relevance,"Answer:\n""""""\nThe French Revolution, a period ...",0,4
2,f1,summary,French_Revolution,triples_dbpedia,succinctness,"Answer:\n""""""\nThe French Revolution, a period ...",0,2


## Analyse results

In [392]:
# Mean `value` for every (type_answer, metric) pair.
data.groupby(["type_answer", "metric"]).agg({"value": "mean"})

Unnamed: 0_level_0,Unnamed: 1_level_0,value
type_answer,metric,Unnamed: 2_level_1
base,diversity,3.833333
base,granularity,3.916667
base,groundedness,1.110942
base,relevance,4.183333
base,succinctness,4.216667
triples_dbpedia,diversity,3.966667
triples_dbpedia,granularity,4.15
triples_dbpedia,groundedness,2.235281
triples_dbpedia,relevance,4.016667
triples_dbpedia,succinctness,3.35


In [395]:
def format(x):
    r"""Render one table cell as LaTeX source.

    Real numbers are rounded to two decimals and returned as plain text.
    Any other value is converted with ``str`` and wrapped in ``\texttt{...}``
    with underscores escaped as ``\_``.

    The decision is made on the value's type rather than on the first
    character of its string form, so negative numbers are rounded, and strings
    that happen to start with a digit (or are empty) are rendered as text
    instead of raising.

    Note: the name shadows the built-in ``format``; it is kept because callers
    in this notebook use it.

    Parameters
    ----------
    x : object
        Cell value.

    Returns
    -------
    str
        LaTeX snippet for the cell.
    """
    import numbers  # local import: the notebook's import cell does not provide it

    # bool is a numbers.Real subclass but should read as text, not as 0/1.
    if isinstance(x, numbers.Real) and not isinstance(x, bool):
        return str(round(x, 2))
    return "\\texttt{" + str(x).replace("_", "\\_") + "}"

# Mean value per (type_answer, metric), reshaped to one row per answer type
# and one column per metric.
mean_res = data.groupby(["type_answer", "metric"]).agg({"value": "mean"})
latex_data = (
    mean_res
    .reset_index()
    .pivot(index="type_answer", columns="metric", values="value")
    .reset_index()
)
# Column order of the printed LaTeX table.
columns = ["type_answer", "granularity", "relevance", "succinctness", "diversity", "groundedness"]
for _, row in latex_data.iterrows():
    # Cells are joined with " & "; each row ends with the LaTeX line break.
    print(' & '.join(format(row[name]) for name in columns) + r" \\")

\texttt{base} & 3.92 & 4.18 & 4.22 & 3.83 & 1.11 \\
\texttt{triples\_dbpedia} & 4.15 & 4.02 & 3.35 & 3.97 & 2.24 \\
\texttt{triples\_ng} & 4.13 & 4.12 & 3.62 & 3.82 & 2.85 \\


In [140]:
# One histogram of `value` per metric, coloured by answer type.
for metric in data.metric.unique():
    # Only the rows of the current metric are plotted; barmode="group" draws
    # the bars of the different type_answer values side by side.
    fig = px.histogram(data[data.metric==metric], x="value", color="type_answer", barmode="group")
    # Text label so the figure can be matched to its metric in the output.
    print(f"Metric: {metric}")
    fig.show()

Metric: granularity


Metric: relevance


Metric: succinctness


Metric: diversity


In [156]:
# Mean `value` for every (type_answer, metric) pair.
# NOTE(review): same expression as the aggregation cell earlier in this
# section; the output recorded below contains no groundedness rows, so it
# appears to come from an earlier state of `data` -- re-run or remove.
data.groupby(["type_answer", "metric"]).agg({"value": "mean"})

Unnamed: 0_level_0,Unnamed: 1_level_0,value
type_answer,metric,Unnamed: 2_level_1
base,diversity,3.759259
base,granularity,3.888889
base,relevance,4.148148
base,succinctness,4.12963
triples_dbpedia,diversity,3.888889
triples_dbpedia,granularity,4.111111
triples_dbpedia,relevance,4.074074
triples_dbpedia,succinctness,3.351852
triples_ng,diversity,3.722222
triples_ng,granularity,4.055556


In [394]:
# Mean value for every (type_answer, metric) pair, flattened for plotting
grouped = data.groupby(["type_answer", "metric"]).agg({"value": "mean"}).reset_index()

# One marker per (metric, answer type); colour encodes the answer type
fig = px.scatter(grouped, x='metric', y='value', color='type_answer', #symbol='metric',
                 color_discrete_sequence=px.colors.qualitative.Set1,
                 #symbol_sequence=['circle', 'square', 'diamond', 'cross', 'x', 'triangle-up']
                 )

# Update layout
# The x axis shows the `metric` column, so it is labelled accordingly
# (it was previously titled 'Type Question').
# NOTE(review): groundedness values in this notebook go down to 0.0, so a mean
# below 1 would fall outside the fixed y range -- confirm the range is intended.
fig.update_layout(
    title='Scatter Plot',
    xaxis=dict(title='Metric'),
    yaxis=dict(title='Value', range=[1,5])
)

# Show the plot
fig.show()

Unnamed: 0,answer,events,nb_events,answer_pp,nb_mentioned
0,"Answer: \n""""""\nBetween January 1, 1792, and Ja...",http://dbpedia.org/resource/Battle_of_Jemappes...,11,"Between January 1, 1792, and January 1, 1793, ...",2
1,"Answer: \n""""""\nBetween January 1, 1792, and Ja...",http://dbpedia.org/resource/Battle_of_Jemappes...,11,"Between January 1, 1792, and January 1, 1793, ...",4
2,"Answer: \n""""""\nBetween January 1, 1792, and Ja...",http://dbpedia.org/resource/Battle_of_Jemappes...,11,"Between January 1, 1792, and January 1, 1793, ...",1


In [183]:
# Show the two count columns of the grounding table side by side.
grounding[["nb_events", "nb_mentioned"]]

Unnamed: 0,nb_events,nb_mentioned
0,11,2
1,11,4
2,11,1
3,1,1
4,1,1
5,1,1
6,5,0
7,5,1
8,5,1
9,3,1
