In [1]:
import pandas as pd
import pickle
import re

In [2]:
def load_cache(cache_fname):
    """Unpickle and return the object stored in the file at ``cache_fname``."""
    # NOTE: unpickling can execute arbitrary code, so only point this at
    # files from a trusted source.
    with open(cache_fname, mode='rb') as handle:
        return pickle.load(handle)

def retrieve_from_cache(cache, question, answer, narrative):
    """
    Look up the cached record for a (question, answer, narrative) triple.

    The three strings are lower-cased and combined into the tuple
    ``(question, answer, narrative)`` that is used as the lookup key.

    Returns ``cache[key]`` when the key is present, otherwise the sentinel
    dict ``{'message': 'Key not found'}``.
    """
    key = (question.lower(), answer.lower(), narrative.lower())
    try:
        return cache[key]
    except KeyError:
        # Only a genuinely missing key counts as a miss; any other failure
        # (or an interrupt) propagates instead of being reported as a miss.
        return {'message': 'Key not found'}

In [3]:
# Load the pickled human-evaluation cache that is passed to every scoring call below.
cache = load_cache('../data/human_eval_cache.pkl')

In [4]:
def remove_punctuation(text):
    """Return ``text`` with every character that is neither a word character nor whitespace removed."""
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub('', text)

In [5]:
def overall_avg_likert(df, cache):
    """
    Print the mean Likert score over all rows of ``df``.

    For each row the cached record is looked up by (question,
    punctuation-stripped answer, narrative). The row's score is the mean of
    that record's ``'val_annotations'`` list, and the printed number is the
    mean of the per-row scores rounded to two decimals. An empty ``df``
    prints ``nan`` instead of raising ``ZeroDivisionError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'question' and 'narrative', plus 'predicted_answer'
        (preferred) or 'answer'.
    cache : mapping
        Passed through to ``retrieve_from_cache``.

    Raises
    ------
    KeyError
        If the record returned for a row has no 'val_annotations' entry
        (i.e. the row was not found in the cache).
    """
    likerts = []
    for _, row in df.iterrows():
        question = row['question']
        try:
            answer = remove_punctuation(row['predicted_answer'])
        except (KeyError, TypeError):
            # KeyError: the frame has no 'predicted_answer' column.
            # TypeError: the value is not a string (e.g. a missing value).
            # Either way fall back to the 'answer' column, as before.
            answer = remove_punctuation(row['answer'])
        narrative = row['narrative']
        info = retrieve_from_cache(cache, question, answer, narrative)
        try:
            scores = info['val_annotations']
        except KeyError:
            # Same exception type as before, but say which row missed.
            raise KeyError(
                f'No cached annotations for question={question!r}, answer={answer!r}'
            ) from None
        likerts.append(sum(scores) / len(scores))
    overall = sum(likerts) / len(likerts) if likerts else float('nan')
    print(f'Overall avg Likert for all answers {round(overall, 2)}')

In [6]:
def overall_avg_binary_likert(df, cache, threshold=1):
    """
    Print the mean binarized Likert score over all rows of ``df``.

    For each row the cached record is looked up by (question,
    punctuation-stripped answer, narrative). Each value in that record's
    ``'val_annotations'`` list is mapped to 0 if it is below ``threshold``
    and to 1 otherwise. The row's score is the mean of those 0/1 values, and
    the printed number is the mean of the per-row scores rounded to two
    decimals. An empty ``df`` prints ``nan`` instead of raising
    ``ZeroDivisionError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'question' and 'narrative', plus 'predicted_answer'
        (preferred) or 'answer'.
    cache : mapping
        Passed through to ``retrieve_from_cache``.
    threshold : number, default 1
        Annotation values strictly below this count as 0; the default keeps
        the original cut-off.

    Raises
    ------
    KeyError
        If the record returned for a row has no 'val_annotations' entry
        (i.e. the row was not found in the cache).
    """
    likerts = []
    for _, row in df.iterrows():
        question = row['question']
        try:
            answer = remove_punctuation(row['predicted_answer'])
        except (KeyError, TypeError):
            # KeyError: the frame has no 'predicted_answer' column.
            # TypeError: the value is not a string (e.g. a missing value).
            # Either way fall back to the 'answer' column, as before.
            answer = remove_punctuation(row['answer'])
        narrative = row['narrative']
        info = retrieve_from_cache(cache, question, answer, narrative)
        try:
            scores = info['val_annotations']
        except KeyError:
            # Same exception type as before, but say which row missed.
            raise KeyError(
                f'No cached annotations for question={question!r}, answer={answer!r}'
            ) from None
        binary_scores = [0 if score < threshold else 1 for score in scores]
        likerts.append(sum(binary_scores) / len(binary_scores))
    overall = sum(likerts) / len(likerts) if likerts else float('nan')
    print(f'Overall avg binary Likert for all answers: {round(overall, 2)}')

In [7]:
def get_all_numbers(df, cache):
    """Run both summary reports on ``df``: ``overall_avg_likert`` first, then ``overall_avg_binary_likert``."""
    for report in (overall_avg_likert, overall_avg_binary_likert):
        report(df, cache)

In [8]:
def evaluate_df_by_onto(df, cache):
    """
    Report scores separately for each ontology category.

    For every label in a fixed list, selects the rows of ``df`` whose 'onto'
    value equals that label, prints the label, and passes the subset to
    ``get_all_numbers``. Rows with any other 'onto' value are not reported.
    """
    onto_labels = ('Consequence', 'Goal seeking', 'Reactionary', 'Desire', 'Other')
    for label in onto_labels:
        subset = df[df['onto'] == label]
        print(label)
        get_all_numbers(subset, cache)

In [9]:
# Ontology annotations (hidden test set, per the file name); its 'question_meta'
# and 'Ontology' columns are used in the cells below.
ontology_df = pd.read_csv('../data/hidden_test_set_ontology.csv')

In [10]:
# Map each 'question_meta' value to its 'Ontology' label; if a meta value
# occurs more than once, the last row wins.
meta_to_ontology_dict = {
    record['question_meta']: record['Ontology']
    for _, record in ontology_df.iterrows()
}

In [11]:
def add_onto_to_df(df, meta_to_ontology_dict):
    """
    Add an 'onto' column to ``df`` holding each row's ontology label.

    For every row, the label is looked up in ``meta_to_ontology_dict`` using
    the row's 'question_meta' value. If that column is absent, or its value
    is not in the mapping, the row's 'meta' value is tried instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place (the 'onto' column is assigned) and also returned.
    meta_to_ontology_dict : dict
        Mapping from meta value to ontology label.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with the 'onto' column.

    Raises
    ------
    KeyError
        If neither 'question_meta' nor 'meta' yields a label for some row.
    """
    ontos = []
    for idx, row in df.iterrows():
        for meta_col in ('question_meta', 'meta'):
            try:
                ontos.append(meta_to_ontology_dict[row[meta_col]])
                break
            except KeyError:
                # Column missing from the row, or value not in the mapping:
                # try the next candidate column.
                continue
        else:
            raise KeyError(
                f"Row {idx!r}: no ontology label found via 'question_meta' or 'meta'"
            )
    df['onto'] = ontos
    return df

In [12]:
# Number of rows per distinct 'Ontology' value, as a plain dict.
onto_count_dict = ontology_df['Ontology'].value_counts().to_dict()

In [13]:
# Display the per-category counts.
onto_count_dict

{'Consequence': 140,
 'Goal seeking': 135,
 'Reactionary': 118,
 'Desire': 41,
 'Other': 30}

In [14]:
# Load the t5base (with separator) predictions, attach the 'onto' column,
# and print the overall Likert and binary-Likert averages.
t5_df = pd.read_csv('../data/camera-ready-predictions/t5base_w_n_separator.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Overall avg Likert for all answers 0.58
Overall avg binary Likert for all answers: 0.61


In [15]:
# Keep only rows whose 'is_ques_answerable' value is 'Not Answerable' and score that subset.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.02
Overall avg binary Likert for all answers: 0.42


In [16]:
# Break the 'Not Answerable' subset's scores down by ontology category.
evaluate_df_by_onto(impl_t5_df, cache)

Consequence
Overall avg Likert for all answers -0.22
Overall avg binary Likert for all answers: 0.32
Goal seeking
Overall avg Likert for all answers 0.33
Overall avg binary Likert for all answers: 0.55
Reactionary
Overall avg Likert for all answers 0.25
Overall avg binary Likert for all answers: 0.49
Desire
Overall avg Likert for all answers 0.29
Overall avg binary Likert for all answers: 0.52
Other
Overall avg Likert for all answers -0.09
Overall avg binary Likert for all answers: 0.37


In [17]:
# Load the t5base '_w_knowl' predictions file, attach the 'onto' column,
# and print the overall Likert and binary-Likert averages.
t5_knowl_df = pd.read_csv('../data/camera-ready-predictions/t5base_w_n_separator_w_knowl.csv')
t5_knowl_df = add_onto_to_df(t5_knowl_df, meta_to_ontology_dict)
get_all_numbers(t5_knowl_df, cache)

Overall avg Likert for all answers 0.91
Overall avg binary Likert for all answers: 0.73


In [18]:
# Keep only rows whose 'is_ques_answerable' value is 'Not Answerable' and score that subset.
impl_t5_knowl_df = t5_knowl_df[t5_knowl_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_knowl_df, cache)

Overall avg Likert for all answers 0.56
Overall avg binary Likert for all answers: 0.61


In [19]:
# Break the 'Not Answerable' subset's scores down by ontology category.
evaluate_df_by_onto(impl_t5_knowl_df, cache)

Consequence
Overall avg Likert for all answers 0.45
Overall avg binary Likert for all answers: 0.57
Goal seeking
Overall avg Likert for all answers 0.7
Overall avg binary Likert for all answers: 0.68
Reactionary
Overall avg Likert for all answers 0.8
Overall avg binary Likert for all answers: 0.71
Desire
Overall avg Likert for all answers 0.63
Overall avg binary Likert for all answers: 0.63
Other
Overall avg Likert for all answers 0.41
Overall avg binary Likert for all answers: 0.53


In [20]:
# Load the t511b (with separator) predictions, attach the 'onto' column,
# and print the overall Likert and binary-Likert averages.
t511b_df = pd.read_csv('../data/camera-ready-predictions/t511b_w_n_separator.csv')
t511b_df = add_onto_to_df(t511b_df, meta_to_ontology_dict)
get_all_numbers(t511b_df, cache)

Overall avg Likert for all answers 1.21
Overall avg binary Likert for all answers: 0.84


In [21]:
# Keep only rows whose 'is_ques_answerable' value is 'Not Answerable' and score that subset.
impl_t511b_df = t511b_df[t511b_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t511b_df, cache)

Overall avg Likert for all answers 0.97
Overall avg binary Likert for all answers: 0.75


In [22]:
# Break the 'Not Answerable' subset's scores down by ontology category.
evaluate_df_by_onto(impl_t511b_df, cache)

Consequence
Overall avg Likert for all answers 0.93
Overall avg binary Likert for all answers: 0.72
Goal seeking
Overall avg Likert for all answers 0.97
Overall avg binary Likert for all answers: 0.75
Reactionary
Overall avg Likert for all answers 1.18
Overall avg binary Likert for all answers: 0.84
Desire
Overall avg Likert for all answers 0.98
Overall avg binary Likert for all answers: 0.76
Other
Overall avg Likert for all answers 0.88
Overall avg binary Likert for all answers: 0.74


In [23]:
# This file is the t511b variant with the top-3 diverse COMET output, verbalized.
# Load it, attach the 'onto' column, and print the overall Likert and
# binary-Likert averages.
t511b_knowl_df = pd.read_csv('../data/camera-ready-predictions/t511b_w_n_separator_w_knowl.csv')
t511b_knowl_df = add_onto_to_df(t511b_knowl_df, meta_to_ontology_dict)
get_all_numbers(t511b_knowl_df, cache)

Overall avg Likert for all answers 1.27
Overall avg binary Likert for all answers: 0.85


In [24]:
# Keep only rows whose 'is_ques_answerable' value is 'Not Answerable' and score that subset.
impl_t511b_knowl_df = t511b_knowl_df[t511b_knowl_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t511b_knowl_df, cache)

Overall avg Likert for all answers 1.04
Overall avg binary Likert for all answers: 0.77


In [25]:
# Break the 'Not Answerable' subset's scores down by ontology category.
evaluate_df_by_onto(impl_t511b_knowl_df, cache)

Consequence
Overall avg Likert for all answers 1.01
Overall avg binary Likert for all answers: 0.74
Goal seeking
Overall avg Likert for all answers 1.29
Overall avg binary Likert for all answers: 0.87
Reactionary
Overall avg Likert for all answers 0.96
Overall avg binary Likert for all answers: 0.73
Desire
Overall avg Likert for all answers 1.08
Overall avg binary Likert for all answers: 0.82
Other
Overall avg Likert for all answers 0.9
Overall avg binary Likert for all answers: 0.74


In [26]:
# Load the gpt3 predictions, attach the 'onto' column, and print the
# overall Likert and binary-Likert averages.
gpt3_df = pd.read_csv('../data/camera-ready-predictions/gpt3.csv')
gpt3_df = add_onto_to_df(gpt3_df, meta_to_ontology_dict)
get_all_numbers(gpt3_df, cache)

Overall avg Likert for all answers 1.17
Overall avg binary Likert for all answers: 0.83


In [27]:
# Keep only rows whose 'is_ques_answerable' value is 'Not Answerable' and score that subset.
impl_gpt3_df = gpt3_df[gpt3_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_gpt3_df, cache)

Overall avg Likert for all answers 1.1
Overall avg binary Likert for all answers: 0.8


In [28]:
# Break the 'Not Answerable' subset's scores down by ontology category.
evaluate_df_by_onto(impl_gpt3_df, cache)

Consequence
Overall avg Likert for all answers 1.11
Overall avg binary Likert for all answers: 0.81
Goal seeking
Overall avg Likert for all answers 1.05
Overall avg binary Likert for all answers: 0.79
Reactionary
Overall avg Likert for all answers 1.09
Overall avg binary Likert for all answers: 0.78
Desire
Overall avg Likert for all answers 1.16
Overall avg binary Likert for all answers: 0.81
Other
Overall avg Likert for all answers 1.09
Overall avg binary Likert for all answers: 0.82


In [29]:
# Load the gpt3 '_w_knowl' predictions file, attach the 'onto' column,
# and print the overall Likert and binary-Likert averages.
gpt3_knowl_df = pd.read_csv('../data/camera-ready-predictions/gpt3_w_knowl.csv')
gpt3_knowl_df = add_onto_to_df(gpt3_knowl_df, meta_to_ontology_dict)
get_all_numbers(gpt3_knowl_df, cache)

Overall avg Likert for all answers 1.32
Overall avg binary Likert for all answers: 0.87


In [30]:
# Keep only rows whose 'is_ques_answerable' value is 'Not Answerable' and score that subset.
impl_gpt3_knowl_df = gpt3_knowl_df[gpt3_knowl_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_gpt3_knowl_df, cache)

Overall avg Likert for all answers 1.24
Overall avg binary Likert for all answers: 0.85


In [31]:
# Break the 'Not Answerable' subset's scores down by ontology category.
evaluate_df_by_onto(impl_gpt3_knowl_df, cache)

Consequence
Overall avg Likert for all answers 1.26
Overall avg binary Likert for all answers: 0.85
Goal seeking
Overall avg Likert for all answers 1.3
Overall avg binary Likert for all answers: 0.9
Reactionary
Overall avg Likert for all answers 1.2
Overall avg binary Likert for all answers: 0.83
Desire
Overall avg Likert for all answers 1.23
Overall avg binary Likert for all answers: 0.85
Other
Overall avg Likert for all answers 1.16
Overall avg binary Likert for all answers: 0.82


# Model Setup

## Base

In [32]:
# Base model, 'Gtup top3' setup: print the label, load the predictions,
# attach the 'onto' column, and print both averages.
# NOTE: rebinds t5_df, so the following subset cell must run right after this one.
print('Gtup top3')
t5_df = pd.read_csv('../data/camera-ready-predictions/t5base_tup_top3_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gtup top3
Overall avg Likert for all answers 0.52
Overall avg binary Likert for all answers: 0.59


In [33]:
# Score only the 'Not Answerable' rows of whichever frame t5_df currently holds.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.08
Overall avg binary Likert for all answers: 0.45


In [34]:
# Base model, 'Gtupsep top3' setup: print the label, load the predictions,
# attach the 'onto' column, and print both averages.
# NOTE: rebinds t5_df, so the following subset cell must run right after this one.
print('Gtupsep top3')
t5_df = pd.read_csv('../data/camera-ready-predictions/t5base_tupsep_top3_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gtupsep top3
Overall avg Likert for all answers 0.52
Overall avg binary Likert for all answers: 0.6


In [35]:
# Score only the 'Not Answerable' rows of whichever frame t5_df currently holds.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.1
Overall avg binary Likert for all answers: 0.46


In [36]:
# Base model, 'Gverb. top1' setup: print the label, load the predictions,
# attach the 'onto' column, and print both averages.
# NOTE: rebinds t5_df, so the following subset cell must run right after this one.
print('Gverb. top1')
t5_df = pd.read_csv('../data/camera-ready-predictions/t5base_verb_top1_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gverb. top1
Overall avg Likert for all answers 0.88
Overall avg binary Likert for all answers: 0.72


In [37]:
# Score only the 'Not Answerable' rows of whichever frame t5_df currently holds.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.51
Overall avg binary Likert for all answers: 0.59


In [38]:
# Base model, 'Gverb. top5 diverse' setup: print the label, load the
# predictions, attach the 'onto' column, and print both averages.
# NOTE: rebinds t5_df, so the following subset cell must run right after this one.
print('Gverb. top5 diverse')
t5_df = pd.read_csv('../data/camera-ready-predictions/t5base_verb_top5_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gverb. top5 diverse
Overall avg Likert for all answers 0.91
Overall avg binary Likert for all answers: 0.73


In [39]:
# Score only the 'Not Answerable' rows of whichever frame t5_df currently holds.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.56
Overall avg binary Likert for all answers: 0.61


In [40]:
# Base model, 'Gverb. top3 original' setup: print the label, load the
# predictions, attach the 'onto' column, and print both averages.
# NOTE: rebinds t5_df, so the following subset cell must run right after this one.
print('Gverb. top3 original')
t5_df = pd.read_csv('../data/camera-ready-predictions/t5base_verb_top3_original.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gverb. top3 original
Overall avg Likert for all answers 0.75
Overall avg binary Likert for all answers: 0.67


In [41]:
# Score only the 'Not Answerable' rows of whichever frame t5_df currently holds.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.26
Overall avg binary Likert for all answers: 0.5


In [42]:
# Base model, 'Gverb. top3 diverse' setup: print the label, load the
# predictions, attach the 'onto' column, and print both averages.
# NOTE: rebinds t5_df, so the following subset cell must run right after this one.
print('Gverb. top3 diverse')
t5_df = pd.read_csv('../data/camera-ready-predictions/t5base_verb_top3_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gverb. top3 diverse
Overall avg Likert for all answers 0.84
Overall avg binary Likert for all answers: 0.7


In [43]:
# Score only the 'Not Answerable' rows of whichever frame t5_df currently holds.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.47
Overall avg binary Likert for all answers: 0.58


In [44]:
# Base model, 'Gverb. top3 reranked' setup: print the label, load the
# predictions, attach the 'onto' column, and print both averages.
# NOTE: rebinds t5_df, so the following subset cell must run right after this one.
print('Gverb. top3 reranked')
t5_df = pd.read_csv('../data/camera-ready-predictions/t5base_verb_top3_reranked.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gverb. top3 reranked
Overall avg Likert for all answers 0.88
Overall avg binary Likert for all answers: 0.71


In [45]:
# Score only the 'Not Answerable' rows of whichever frame t5_df currently holds.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.59
Overall avg binary Likert for all answers: 0.62


In [46]:
# Base model, no-separator format: print the label, load the predictions,
# attach the 'onto' column, and print both averages.
# NOTE: rebinds t5_df, so the following subset cell must run right after this one.
print('T5 Appendix D.3 format - no separator')
t5_df = pd.read_csv('../data/camera-ready-predictions/t5base.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

T5 Appendix D.3 format - no separator
Overall avg Likert for all answers 0.36
Overall avg binary Likert for all answers: 0.56


In [47]:
# Score only the 'Not Answerable' rows of whichever frame t5_df currently holds.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers -0.27
Overall avg binary Likert for all answers: 0.34


## 11B

In [48]:
# 11B model, 'Gtup top3' setup: print the label, load the predictions,
# attach the 'onto' column, and print both averages.
# NOTE(review): t5_df is rebound to a t511b predictions file here; the
# variable name is reused from the base-model cells.
print('Gtup top3')
t5_df = pd.read_csv('../data/camera-ready-predictions/t511b_tup_top3_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gtup top3
Overall avg Likert for all answers 1.24
Overall avg binary Likert for all answers: 0.84


In [49]:
# Score only the 'Not Answerable' rows of whichever frame t5_df currently holds.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.97
Overall avg binary Likert for all answers: 0.72


In [50]:
# 11B model, 'Gtupsep top3' setup: print the label, load the predictions,
# attach the 'onto' column, and print both averages.
# NOTE(review): t5_df is rebound to a t511b predictions file here; the
# variable name is reused from the base-model cells.
print('Gtupsep top3')
t5_df = pd.read_csv('../data/camera-ready-predictions/t511b_tupsep_top3_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gtupsep top3
Overall avg Likert for all answers 1.11
Overall avg binary Likert for all answers: 0.79


In [51]:
# Score only the 'Not Answerable' rows of whichever frame t5_df currently holds.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.81
Overall avg binary Likert for all answers: 0.68


In [52]:
# 11B model, 'Gverb. top1' setup: print the label, load the predictions,
# attach the 'onto' column, and print both averages.
# NOTE(review): t5_df is rebound to a t511b predictions file here; the
# variable name is reused from the base-model cells.
print('Gverb. top1')
t5_df = pd.read_csv('../data/camera-ready-predictions/t511b_verb_top1_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gverb. top1
Overall avg Likert for all answers 1.24
Overall avg binary Likert for all answers: 0.84


In [53]:
# Score only the 'Not Answerable' rows of whichever frame t5_df currently holds.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.95
Overall avg binary Likert for all answers: 0.71


In [54]:
# 11B model, 'Gverb. top5 diverse' setup: print the label, load the
# predictions, attach the 'onto' column, and print both averages.
# NOTE(review): t5_df is rebound to a t511b predictions file here; the
# variable name is reused from the base-model cells.
print('Gverb. top5 diverse')
t5_df = pd.read_csv('../data/camera-ready-predictions/t511b_verb_top5_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gverb. top5 diverse
Overall avg Likert for all answers 1.25
Overall avg binary Likert for all answers: 0.85


In [55]:
# Score only the 'Not Answerable' rows of whichever frame t5_df currently holds.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 1.05
Overall avg binary Likert for all answers: 0.78


In [56]:
# 11B model, 'Gverb. top3 original' setup: print the label, load the
# predictions, attach the 'onto' column, and print both averages.
# NOTE(review): t5_df is rebound to a t511b predictions file here; the
# variable name is reused from the base-model cells.
print('Gverb. top3 original')
t5_df = pd.read_csv('../data/camera-ready-predictions/t511b_verb_top3_original.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gverb. top3 original
Overall avg Likert for all answers 1.24
Overall avg binary Likert for all answers: 0.84


In [57]:
# Score only the 'Not Answerable' rows of whichever frame t5_df currently holds.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 1.01
Overall avg binary Likert for all answers: 0.74


In [58]:
# 11B model, 'Gverb. top3 diverse' setup: print the label, load the
# predictions, attach the 'onto' column, and print both averages.
# NOTE(review): t5_df is rebound to a t511b predictions file here; the
# variable name is reused from the base-model cells.
print('Gverb. top3 diverse')
t5_df = pd.read_csv('../data/camera-ready-predictions/t511b_verb_top3_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gverb. top3 diverse
Overall avg Likert for all answers 1.27
Overall avg binary Likert for all answers: 0.85


In [59]:
# Score only the 'Not Answerable' rows of whichever frame t5_df currently holds.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 1.04
Overall avg binary Likert for all answers: 0.77


In [60]:
# 11B model, 'Gverb. top3 reranked' setup: print the label, load the
# predictions, attach the 'onto' column, and print both averages.
# NOTE(review): t5_df is rebound to a t511b predictions file here; the
# variable name is reused from the base-model cells.
print('Gverb. top3 reranked')
t5_df = pd.read_csv('../data/camera-ready-predictions/t511b_verb_top3_reranked.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gverb. top3 reranked
Overall avg Likert for all answers 1.23
Overall avg binary Likert for all answers: 0.83


In [61]:
# Score only the 'Not Answerable' rows of whichever frame t5_df currently holds.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.99
Overall avg binary Likert for all answers: 0.73


In [62]:
# 11B model, no-separator format: print the label, load the predictions,
# attach the 'onto' column, and print both averages.
# NOTE(review): t5_df is rebound to a t511b predictions file here; the
# variable name is reused from the base-model cells.
print('T5 Appendix D.3 format - no separator')
t5_df = pd.read_csv('../data/camera-ready-predictions/t511b.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

T5 Appendix D.3 format - no separator
Overall avg Likert for all answers 0.99
Overall avg binary Likert for all answers: 0.77


In [63]:
# Score only the 'Not Answerable' rows of whichever frame t5_df currently holds.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.6
Overall avg binary Likert for all answers: 0.62
