In [1]:
import os
os.environ['HF_HOME'] = '/data1/malto/cache'

In [2]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset, DatasetDict, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Trainer,
)
import pandas as pd
import torch
import evaluate
import os

BASE_DIR = "/data1/malto/fborra/fig"
BATCH_SIZE = 64
NUM_EPOCHS = 7
N_GEN = 50
base_checkpoint = "t5-small"

tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ds = load_dataset("ColumbiaNLP/FLUTE").shuffle(seed=42)
df = pd.read_csv("complete_dataset.csv").fillna("")
ds = Dataset.from_pandas(df).shuffle(seed=42)
folds = StratifiedKFold(n_splits=10, shuffle=False)
splits = folds.split(ds, ds["label"])
indexes = [t for t in splits]

In [3]:
from flute_dream import add_combined_cols

ds = ds.map(add_combined_cols)

Map:   0%|          | 0/7534 [00:00<?, ? examples/s]

In [4]:
def preprocess_dataset_s1(examples):
    model_inputs = tokenizer(examples["premise_hypothesis"])
    labels = tokenizer(examples["label_explanation"])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_dataset_s2(examples):
    model_inputs = tokenizer(examples["premise_hypothesis_system_2"])
    labels = tokenizer(examples["type_label_explanation"])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_dataset_s31(examples):
    model_inputs = tokenizer(examples["premise_hypothesis_emotion"])
    labels = tokenizer(examples["label_explanation"])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_dataset_s32(examples):
    model_inputs = tokenizer(examples["premise_hypothesis_motivation"])
    labels = tokenizer(examples["label_explanation"])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_dataset_s33(examples):
    model_inputs = tokenizer(examples["premise_hypothesis_consequence"])
    labels = tokenizer(examples["label_explanation"])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_dataset_s34(examples):
    model_inputs = tokenizer(examples["premise_hypothesis_rot"])
    labels = tokenizer(examples["label_explanation"])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_dataset_s35(examples):
    model_inputs = tokenizer(examples["premise_hypothesis_all_dims"])
    labels = tokenizer(examples["label_explanation"])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_dataset_s41(examples):
    model_inputs = tokenizer(examples["premise_hypothesis"])
    labels = tokenizer(examples["label"])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_dataset_s42(examples):
    model_inputs = tokenizer(examples["premise_hypothesis_label"])
    labels = tokenizer(examples["explanation"])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_dataset_s7(examples):
    lbl_exp = [
        ex[ex.find("Explanation: ") :] + ex[: ex.find("Explanation: ")]
        for ex in examples["label_explanation"]
    ]
    model_inputs = tokenizer(examples["premise_hypothesis"])
    labels = tokenizer(lbl_exp)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def get_path(name):
    return f"{BASE_DIR}/{name}/{os.listdir(BASE_DIR + '/' + name)[0]}"


def get_prem_hyp(s):
    ind = s.find("Hypothesis: ")
    prem = s[len("Premise: ") : ind]
    hyp = s[ind + len("Hypothesis: ") :]
    return [prem, hyp]


"""Class encapsulating the two steps of System 4 (Classify, then Explain)"""


class DREAM_FLUTE_System4:
    def __init__(
        self, tokenizer=None, model_s41_path=None, model_s42_path=None
    ) -> None:
        self.tokenizer = (
            tokenizer
            if tokenizer is not None
            else AutoTokenizer.from_pretrained("t5-small")
        )
        self.model_s41 = (
            AutoModelForSeq2SeqLM.from_pretrained(model_s41_path)
            if model_s41_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained(
                "YoanBOUTE/DREAM-FLUTE-S4-Classify"
            )
        )
        self.model_s42 = (
            AutoModelForSeq2SeqLM.from_pretrained(model_s42_path)
            if model_s42_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained(
                "YoanBOUTE/DREAM-FLUTE-S4-Explain"
            )
        )

    """Expected input for function : "Premise: ... . Hypothesis: ... . Is there a contradiction or entailment between the premise and hypothesis ?" 
    Or list of strings in this format"""

    def predict(self, inputs):
        conc_inp = [in1 + in2 for in1, in2 in inputs]
        self.model_s41 = self.model_s41.to(device)
        self.model_s42 = self.model_s42.to(device)
        tok_input = self.tokenizer(conc_inp, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
        with torch.no_grad():
            output_model_1 = self.model_s41.generate(tok_input, max_new_tokens=100)
        labels = ["Label: " + el for el in self.tokenizer.batch_decode(output_model_1, skip_special_tokens=True)]
        s1 = "Is there a contradiction or entailment between the premise and hypothesis ?"
        s2 = ". What is the explanation of the label associated to the premise and the hypothesis ?"
        intermediate_input = [inp[:inp.find(s1)] + lbl + s2 for inp, lbl in zip(conc_inp, labels)]
        tok_intermediate_input = self.tokenizer(intermediate_input, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
        with torch.no_grad():
            output_model_2 = self.model_s42.generate(tok_intermediate_input, max_new_tokens=100)
        explanations = self.tokenizer.batch_decode(output_model_2, skip_special_tokens=True)
        return [lbl + ". Explanation: " + expl for lbl, expl in zip(labels, explanations)]
    

"""Ensemble class that loads all models from HuggingFace (or from the device if a path to the model is indicated) 
and implements the ensembling algorithm given in the DREAM-FLUTE paper"""


class DREAM_FLUTE_Ensemble:
    def __init__(
        self,
        tokenizer_path=None,
        s1_path=None,
        s2_path=None,
        s3_emo_path=None,
        s3_mot_path=None,
        s3_cons_path=None,
        s3_rot_path=None,
        s3_alldims_path=None,
        s4_clas_path=None,
        s4_exp_path=None,
        dream_path=None,
    ) -> None:
        self.tokenizer = (
            AutoTokenizer.from_pretrained(tokenizer_path)
            if tokenizer_path is not None
            else AutoTokenizer.from_pretrained("t5-small")
        )
        self.model_s1 = (
            AutoModelForSeq2SeqLM.from_pretrained(s1_path)
            if s1_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained("YoanBOUTE/DREAM-FLUTE-S1")
        )
        self.model_s2 = (
            AutoModelForSeq2SeqLM.from_pretrained(s2_path)
            if s2_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained("YoanBOUTE/DREAM-FLUTE-S2")
        )
        self.model_s3_emo = (
            AutoModelForSeq2SeqLM.from_pretrained(s3_emo_path)
            if s3_emo_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained(
                "YoanBOUTE/DREAM-FLUTE-S3-Emotion"
            )
        )
        self.model_s3_mot = (
            AutoModelForSeq2SeqLM.from_pretrained(s3_mot_path)
            if s3_mot_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained(
                "YoanBOUTE/DREAM-FLUTE-S3-Motivation"
            )
        )
        self.model_s3_cons = (
            AutoModelForSeq2SeqLM.from_pretrained(s3_cons_path)
            if s3_cons_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained(
                "YoanBOUTE/DREAM-FLUTE-S3-Consequence"
            )
        )
        self.model_s3_rot = (
            AutoModelForSeq2SeqLM.from_pretrained(s3_rot_path)
            if s3_rot_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained("YoanBOUTE/DREAM-FLUTE-S3-ROT")
        )
        self.model_s3_alldims = (
            AutoModelForSeq2SeqLM.from_pretrained(s3_alldims_path)
            if s3_alldims_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained(
                "YoanBOUTE/DREAM-FLUTE-S3-AllDims"
            )
        )
        self.model_s4 = DREAM_FLUTE_System4(self.tokenizer, s4_clas_path, s4_exp_path)
        self.model_dream = (
            AutoModelForSeq2SeqLM.from_pretrained(dream_path)
            if dream_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained("RicoBorra/DREAM-t5-small")
        )

    """Tokenizes the input, then feeds it to the given model, and decodes the output to have a string as result.
    This method is callable for all models except System 4 (Use the method defined in the class of System 4)"""

    def _prediction_pipeline(self, inputs: str, model) -> str:
        tokenized_input = self.tokenizer(inputs, return_tensors="pt", truncation=True, padding=True).input_ids.to(device)
        model = model.to(device)
        model_output = model.generate(tokenized_input, max_new_tokens=100)
        decoded_output = self.tokenizer.batch_decode(model_output, skip_special_tokens=True)
        return decoded_output

    def _get_batched_predictions(self, inputs: list):
        prems, hyps = zip(*inputs)
        prems = [prem.strip() for prem in prems]
        hyps = [hyp.strip() for hyp in hyps]
        prems = [prem + "." if not prem.endswith(".") else prem for prem in prems]
        hyps = [hyp + "." if not hyp.endswith(".") else hyp for hyp in hyps]

        predictions = dict()

        input_1 = [f"Premise: {prem} Hypothesis: {hyp} Is there a contradiction or entailment between the premise and hypothesis ?" for prem, hyp in zip(prems, hyps)]
        predictions["S1"] = self._prediction_pipeline(input_1, self.model_s1)

        input_2 = [f"Premise: {prem} Hypothesis: {hyp} What is the type of figurative language involved? Is there a contradiction or entailment between the premise and hypothesis ?" for prem, hyp in zip(prems, hyps)]
        predictions["S2"] = self._prediction_pipeline(input_2, self.model_s2)

        # DREAM elaborations for system 3
        input_dream_prems = [f"[SITUATION] {prem} [QUERY] " for prem in prems]
        input_dream_hyps = [f"[SITUATION] {hyp} [QUERY] " for hyp in hyps]
        prem_elaborations = {key: self._prediction_pipeline([idp + key for idp in input_dream_prems], self.model_dream) for key in ["emotion", "motivation", "consequence", "rot"]}
        for key, elabs in prem_elaborations.items():
            elabs = [elab.strip() for elab in elabs]
            elabs = [elab + "." if not elab.endswith(".") else elab for elab in elabs]
            prem_elaborations[key] = elabs
        hyp_elaborations = {key: self._prediction_pipeline([idh + key for idh in input_dream_hyps], self.model_dream) for key in ["emotion", "motivation", "consequence", "rot"]}
        for key, elabs in hyp_elaborations.items():
            elabs = [elab.strip() for elab in elabs]
            elabs = [elab + "." if not elab.endswith(".") else elab for elab in elabs]
            hyp_elaborations[key] = elabs

        input_3_emo = []
        for i in range(len(inputs)):
            input_3_emo.append(f"Premise: {prems[i]} [Emotion] {prem_elaborations['emotion'][i]} Hypothesis: {hyps[i]} [Emotion] {hyp_elaborations['emotion'][i]} Is there a contradiction or entailment between the premise and hypothesis ?")
        predictions["S3-emo"] = self._prediction_pipeline(input_3_emo, self.model_s3_emo)

        input_3_mot = []
        for i in range(len(inputs)):
            input_3_mot.append(f"Premise: {prems[i]} [Motivation] {prem_elaborations['motivation'][i]} Hypothesis: {hyps[i]} [Motivation] {hyp_elaborations['motivation'][i]} Is there a contradiction or entailment between the premise and hypothesis ?")
        predictions["S3-mot"] = self._prediction_pipeline(input_3_mot, self.model_s3_mot)

        input_3_cons = []
        for i in range(len(inputs)):
            input_3_cons.append(f"Premise: {prems[i]} [Consequence] {prem_elaborations['consequence'][i]} Hypothesis: {hyps[i]} [Consequence] {hyp_elaborations['consequence'][i]} Is there a contradiction or entailment between the premise and hypothesis ?")
        predictions["S3-cons"] = self._prediction_pipeline(input_3_cons, self.model_s3_cons)

        input_3_rot = []
        for i in range(len(inputs)):
            input_3_rot.append(f"Premise: {prems[i]} [Rot] {prem_elaborations['rot'][i]} Hypothesis: {hyps[i]} [Rot] {hyp_elaborations['rot'][i]} Is there a contradiction or entailment between the premise and hypothesis ?")
        predictions["S3-rot"] = self._prediction_pipeline(input_3_rot, self.model_s3_rot)

        input_3_all = []
        for prem, hyp in zip(prems, hyps):
            el = f"Premise: {prem} "
            for key, elab in prem_elaborations.items():
                el += f"[{key.capitalize()}] {elab} "
            el += f"Hypothesis: {hyp} "
            for key, elab in hyp_elaborations.items():
                el += f"[{key.capitalize()}] {elab} "
            el += "Is there a contradiction or entailment between the premise and hypothesis ?"
            input_3_all.append(el)
        predictions["S3-all"] = self._prediction_pipeline(input_3_all, self.model_s3_alldims)

        # The input for system 4 is in the same format as for system 1
        predictions["S4"] = self.model_s4.predict([[in1, ""] for in1 in input_1])

        return predictions

    """Uses the predictions from each model to compute the final prediction of the ensemble"""

    def _ensemble_algorithm(self, model_outputs):
        # Firstly, the label is selected based on the majority between the 5 best models (according to the paper : systems 1, 2, 3-motivation, 3-alldims, 4)
        labels = [
            model_outputs[key].split(".")[0]
            for key in ["S1", "S2", "S3-mot", "S3-all", "S4"]
        ]
        # Sometimes, it might happen with the small models that the generated label is a mix of words, like 'Contratailment' or 'Endiction'
        for label in labels:
            if label not in ["Label: Contradiction", "Label: Entailment"]:
                labels.remove(label)
        unique, counts = np.unique(labels, return_counts=True)
        ix = np.argmax(counts)
        major_label = unique[ix]

        # Then, pick the explanation of the first system agreeing with the majority label, following an order indicated in the paper
        for key in ["S3-cons", "S3-emo", "S2", "S3-all", "S3-mot", "S4", "S1"]:
            substrings = model_outputs[key].split(".")
            label = substrings[0]
            explanation = substrings[1]

            if label == major_label:
                break

        return major_label + "." + explanation + "."

    """Expected input : [Premise_sentence, hypothesis_sentence] or list of inputs"""

    def predict(self, inputs):
        preds_dict = self._get_batched_predictions(inputs)
        preds = [{k: preds_dict[k][i] for k in preds_dict} for i in range(len(inputs))]
        final_pred = [self._ensemble_algorithm(pred) for pred in preds]

        return final_pred

In [5]:
operating_modes = [
    ("system_1", preprocess_dataset_s1),
    ("system_2", preprocess_dataset_s2),
    ("system_31", preprocess_dataset_s31),
    ("system_32", preprocess_dataset_s32),
    ("system_33", preprocess_dataset_s33),
    ("system_34", preprocess_dataset_s34),
    ("system_35", preprocess_dataset_s35),
    ("system_4", preprocess_dataset_s1),
    ("system_5", preprocess_dataset_s1),
    ("system_6", preprocess_dataset_s1),
    ("system_7", preprocess_dataset_s7),
]

In [6]:
def train_model(name, model, curr_ds, num_epochs=NUM_EPOCHS):
    training_args = Seq2SeqTrainingArguments(
        output_dir=f"{BASE_DIR}/{name}",
        learning_rate=3e-4,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=2 * BATCH_SIZE,
        save_total_limit=1,
        num_train_epochs=num_epochs,
        report_to="none",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        eval_accumulation_steps=1,
        logging_steps=1,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=curr_ds["train"],
        eval_dataset=curr_ds["val"].select(range(350)),
        tokenizer=tokenizer,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    )

    trainer.train()

    return trainer

In [7]:
records = []

## Train + Evaluate

In [8]:
from IPython.display import clear_output

modes = {t: {name: [] for name, _ in operating_modes} for t in ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']}

for train_idxs, val_idxs in indexes[7:]:
    fold_dataset = DatasetDict(
        {"train": ds.select(train_idxs), "val": ds.select(val_idxs)}
    )

    for name, preprocess_func in operating_modes:
        curr_ds = fold_dataset.map(preprocess_func, batched=True).remove_columns(fold_dataset["train"].column_names)

        if name == "system_4":
            ds_41 = fold_dataset.map(preprocess_dataset_s41, batched=True).remove_columns(fold_dataset["train"].column_names)
            train_model("system_41", AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint), ds_41, 2)
            ds_42 = fold_dataset.map(preprocess_dataset_s42, batched=True).remove_columns(fold_dataset["train"].column_names)
            train_model("system_42", AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint), ds_42, 8)
        elif name == "system_5":
            pass
        else:
            if name == "system_6":
                model = AutoModelForSeq2SeqLM.from_pretrained("RicoBorra/T5-small-synthetic-FLUTE")
            else:
                model = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint)
            trainer = train_model(name, model, curr_ds)

        # have to do batched rouge computation otherwise not enough memory
        rouge = evaluate.load("rouge")
        metrics = {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0, "rougeLsum": 0.0}
        count = 0
        for i in range(0, len(curr_ds["val"]), N_GEN):
            count += 1
            if (name != "system_4") and (name != "system_5"):
                (predictions, _), label_ids, _ = trainer.predict(test_dataset=curr_ds["val"].select(range(i, min(i + N_GEN, len(curr_ds["val"])))))
                # delete stuff after EOS token
                predicted_token_ids = torch.argmax(torch.from_numpy(predictions), dim=-1)
                for j in range(predicted_token_ids.shape[0]):
                    ind = (predicted_token_ids[j] == 1).nonzero(as_tuple=True)[0]
                    if ind.numel() != 0:
                        predicted_token_ids[j, ind[0] :] = 1
                decoded_preds = tokenizer.batch_decode(predicted_token_ids, skip_special_tokens=True)
                # clean decoded preds if needed
                if name == "system_2":
                    decoded_preds = [dp[dp.find("Label") :] for dp in decoded_preds]
                if name == "system_7":
                    decoded_preds = [ex[ex.find("Label: ") :] + " " + ex[: ex.find("Label: ")] for ex in decoded_preds]
            else:
                small_ds = curr_ds["val"].select(range(i, min(i + N_GEN, len(curr_ds["val"]))))
                label_ids = small_ds["labels"]
                max_len = max([len(el) for el in label_ids])
                label_ids = [el + (max_len - len(el)) * [0] for el in label_ids]
                inputs = tokenizer.batch_decode(small_ds["input_ids"], skip_special_tokens=True)
                if name == "system_4":
                    sys_4 = DREAM_FLUTE_System4(
                        tokenizer=None,
                        model_s41_path=get_path("system_41"),
                        model_s42_path=get_path("system_42"),
                    )
                    decoded_preds = sys_4.predict(inputs=[[in1, ""] for in1 in inputs])
                else:  # "name == system_5"
                    inputs = [get_prem_hyp(ex) for ex in inputs]
                    sys_5 = DREAM_FLUTE_Ensemble(
                        tokenizer_path=None,
                        s1_path=get_path("system_1"),
                        s2_path=get_path("system_2"),
                        s3_emo_path=get_path("system_31"),
                        s3_mot_path=get_path("system_32"),
                        s3_cons_path=get_path("system_33"),
                        s3_rot_path=get_path("system_34"),
                        s3_alldims_path=get_path("system_35"),
                        s4_clas_path=get_path("system_41"),
                        s4_exp_path=get_path("system_42"),
                        dream_path=None,
                    )
                    decoded_preds = sys_5.predict(inputs=inputs)

            # careful here
            labels = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            if name == "system_2":
                decoded_labels = [dp[dp.find("Label") :] for dp in decoded_labels]
            if name == "system_7":
                decoded_labels = [ex[ex.find("Label: ") :] + " " + ex[: ex.find("Label: ")] for ex in decoded_labels]
            new_metrics = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            for k in new_metrics:
                metrics[k] += new_metrics[k]

        if name == "system_4":
            del sys_4
        if name == "system_5":
            del sys_5
        if (name != "system_4") and (name != "system_5"):
            del trainer

        for k in metrics:
            metrics[k] /= count

        clear_output(wait=True)
        for k in metrics:
            modes[k][name].append(metrics[k])
        print(modes)
        # print(name, decoded_preds[0], decoded_labels[0])
        records.append({"name": name, "content": decoded_preds[0]})
        # print(predicted_token_ids[0], labels[0])

{'rouge1': {'system_1': [0.6246878094875561, 0.6341441822703994, 0.6274991372461587], 'system_2': [0.624626367838713, 0.6362783206601219, 0.6354809384039172], 'system_31': [0.6220648604882084, 0.6300350744143502, 0.6260482827509799], 'system_32': [0.6252454148480854, 0.6318133565880665, 0.627464216395424], 'system_33': [0.6227111426025169, 0.6346048027312454, 0.6259685606548056], 'system_34': [0.6214302948230429, 0.6308764223358232, 0.6264845821708296], 'system_35': [0.6254565678340289, 0.6345294181286941, 0.6264105558628021], 'system_4': [0.5062272076188992, 0.5020769021769059, 0.5118406101470804], 'system_5': [0.4826523491171023, 0.48991883382314916, 0.5023352266100427], 'system_6': [0.644827581196672, 0.643758405662465, 0.6486185437193592], 'system_7': [0.6313350172884785, 0.6369078181311033, 0.6374436402782385]}, 'rouge2': {'system_1': [0.3664416264089301, 0.3712405815927888, 0.3682803628229483], 'system_2': [0.3611705697191213, 0.3774227651927847, 0.37941467463046064], 'system_31'

## Examples

In [9]:
tokenizer.decode(curr_ds['val'][i]['input_ids'])

'Premise: They were really disappointed and thus cried a lot. Hypothesis: Then they cried a sea of tears. Is there a contradiction or entailment between the premise and hypothesis?</s>'

In [10]:
records.append({"name": "ground_truth", "content": decoded_labels[0]})
records[-12:]

[{'name': 'system_1',
  'content': 'Label: Entailment. Explanation: A sea of tears is that werecried a lot,'},
 {'name': 'system_2',
  'content': 'Label: Entailment. Explanation: A sea of tears is that werecried a lot,'},
 {'name': 'system_31',
  'content': 'Label: Contratailment. Explanation: A sea of tears is that werecried a lot,'},
 {'name': 'system_32',
  'content': 'Label: Contratailment. Explanation: A sea of tears is that werecried a lot,'},
 {'name': 'system_33',
  'content': 'Label: Contratailment. Explanation: To sea of tears is that werecried a lot,'},
 {'name': 'system_34',
  'content': 'Label: Contratailment. Explanation: A sea of tears is that werecried a lot,'},
 {'name': 'system_35',
  'content': 'Label: Contratailment. Explanation: A sea of tears is that werecried a lot,'},
 {'name': 'system_4',
  'content': 'Label: Entailment. Explanation: A sea of tears is a very sad and sad experience, so the entailment is that they were disappointed and thus cried a lot.'},
 {'nam

In [42]:
l = [{'rouge1': {'system_1': [0.6287601003862089, 0.633608795881407, 0.6326245326247466, 0.6359633208729055], 'system_2': [0.6321762360161955, 0.6325924906185946, 0.6349583752497027, 0.6370928735898586], 'system_31': [0.6252684861050168, 0.6287479980898582, 0.6254257186251064, 0.6359014185096944], 'system_32': [0.6227325635158591, 0.6284970078568923, 0.6260437036629444, 0.6331590632121954], 'system_33': [0.6220287341673076, 0.6337049542932675, 0.6283826160785351, 0.6409376689905746], 'system_34': [0.6285204428147866, 0.6284956350109303, 0.6279062790456762, 0.6351394258874956], 'system_35': [0.6261767391985277, 0.6285666524223309, 0.6262204939813054, 0.6420310670036855], 'system_4': [0.4987157818476132, 0.5182389569456386, 0.5072293647328896, 0.5126676980623377], 'system_5': [0.4816794029394727, 0.4905344931133778, 0.49090627925497504, 0.4968458603236464], 'system_6': [0.6424101841590819, 0.6470189389309196, 0.6428308097005095, 0.6510190306355902], 'system_7': [0.6324059287064632, 0.6391049881943325, 0.6353492011974757, 0.6464589585350855]}, 'rouge2': {'system_1': [0.366722728178428, 0.3779535604765535, 0.37426661026748115, 0.3713801665909031], 'system_2': [0.3734527097400099, 0.3775447202456451, 0.3797095473370223, 0.3719168021378222], 'system_31': [0.36213480417270916, 0.36667956426466847, 0.3667457535604975, 0.36908219851360957], 'system_32': [0.3626127732883249, 0.3662212747378505, 0.36718043524563065, 0.36498176984702313], 'system_33': [0.35470679399261645, 0.3751899368323427, 0.36722002291516076, 0.3764078604879013], 'system_34': [0.36477529783674784, 0.36952227254879033, 0.36179103225623577, 0.3665042593059199], 'system_35': [0.3654354130425697, 0.3695895189657151, 0.36943702892833014, 0.3749102823438876], 'system_4': [0.27160900554873496, 0.287755701928143, 0.2786986414527335, 0.2885756666202249], 'system_5': [0.24904353732729848, 0.2653486817457688, 0.261210130605974, 0.2677654983476413], 'system_6': [0.3874841616865552, 0.3990846120404418, 0.3930696227164031, 0.3995819287018828], 'system_7': [0.370641728688517, 0.3849958149109144, 0.37556007498719357, 0.38611663742947094]}, 'rougeL': {'system_1': [0.5937647917769434, 0.6022469712464507, 0.598277066525734, 0.6024678613122201], 'system_2': [0.5982454075559702, 0.6013652345145185, 0.6010545703192718, 0.6043261855741717], 'system_31': [0.5897076314457856, 0.5964938037307009, 0.5916588883367806, 0.603860271636886], 'system_32': [0.5885703082990177, 0.5970361423585644, 0.5930345037743158, 0.6021614475828464], 'system_33': [0.5862009601703836, 0.5996523558819924, 0.5936712524208784, 0.6064491239791959], 'system_34': [0.5934037841204706, 0.5969519857463006, 0.5937523781960504, 0.6015791572015099], 'system_35': [0.5909041457096427, 0.5970652653376252, 0.5949275337957723, 0.609378233470345], 'system_4': [0.4265507989464661, 0.4464415816810788, 0.436819506869001, 0.44651364079850586], 'system_5': [0.40694851054281983, 0.41619936693556653, 0.41690136214488116, 0.4277333659090989], 'system_6': [0.6105283711516876, 0.6180166840348906, 0.6139627587880736, 0.621954703219382], 'system_7': [0.5911489097227227, 0.6020951413836534, 0.5952087295919954, 0.6082427572827123]}, 'rougeLsum': {'system_1': [0.5936802444692018, 0.602275572378925, 0.5980022905225342, 0.6020612606643976], 'system_2': [0.5981067330200075, 0.6015137217046728, 0.6010074825534009, 0.6040741060496922], 'system_31': [0.5897807858718463, 0.5965412725886905, 0.5917132581740361, 0.6037339257156672], 'system_32': [0.5888300327083514, 0.5970839654452716, 0.5928010153323514, 0.6019709183801375], 'system_33': [0.5859989510368934, 0.5995703446176975, 0.5937112435174209, 0.6064841176939894], 'system_34': [0.5932893976160784, 0.5969178415050882, 0.5934974154999059, 0.6012418506470679], 'system_35': [0.5908572226150147, 0.5970970285399859, 0.5949223162395134, 0.6093904942752214], 'system_4': [0.42665386310064535, 0.4467368645403539, 0.4369731724826026, 0.44616742968841], 'system_5': [0.4067795208297238, 0.41626327803809643, 0.41680094045553845, 0.4273509250371629], 'system_6': [0.6104224621227644, 0.6180937228521529, 0.6139634680396846, 0.6218027324597369], 'system_7': [0.5909423818063468, 0.6020931876839676, 0.5949700639114215, 0.608187846066667]}},
{'rouge1': {'system_1': [0.6369103844306321, 0.6328006678071453, 0.6274612564190746], 'system_2': [0.6416113839275822, 0.634377422798425, 0.6263296707635586], 'system_31': [0.6347884428292104, 0.6279810883643742, 0.6201299531010094], 'system_32': [0.6297464396281792, 0.6234853578029658, 0.6187048470040165], 'system_33': [0.6358334591695605, 0.631011877750443, 0.620365778338396], 'system_34': [0.6351919703723758, 0.6327002342868852, 0.6191225068024896], 'system_35': [0.641239116239296, 0.6313603947473296, 0.617818623721891], 'system_4': [0.5159443164489308, 0.49851277681693035, 0.4895436644252909], 'system_5': [0.5034900787622951, 0.491114337321584, 0.48546406128399006], 'system_6': [0.6446946871427579, 0.6438637569516972, 0.6357534095152252], 'system_7': [0.6437452287907135, 0.6417058719479158, 0.6310622112169739]}, 'rouge2': {'system_1': [0.3772994206743473, 0.3598730607530899, 0.3624861153823348], 'system_2': [0.3813299762454619, 0.36833771827941975, 0.36182396229908975], 'system_31': [0.3715147825221117, 0.3544431410639804, 0.3527763972099442], 'system_32': [0.3597483289253308, 0.3551003260894073, 0.3507477187174731], 'system_33': [0.3671870941883696, 0.3616625114533587, 0.3483420593756244], 'system_34': [0.3698184486078828, 0.3606281379224917, 0.3477009883149291], 'system_35': [0.3796411935621023, 0.35833648068004303, 0.34999243905725175], 'system_4': [0.2923132821041788, 0.2652345460034492, 0.25807420748703297], 'system_5': [0.2698217319625177, 0.25320977582484666, 0.2558602866722933], 'system_6': [0.38708354443673393, 0.38659972043798396, 0.3781310260413493], 'system_7': [0.3862627035332788, 0.3762742569483144, 0.36656950398227367]}, 'rougeL': {'system_1': [0.6011969011933326, 0.5957262325821415, 0.5939217864802092], 'system_2': [0.6061045887277551, 0.5977509686067606, 0.5927475206727556], 'system_31': [0.5990035020238528, 0.5912327522902248, 0.586517654969315], 'system_32': [0.5945914522827086, 0.5885254981447267, 0.5859044646198465], 'system_33': [0.5996295383243999, 0.5945534242474153, 0.5870167902793015], 'system_34': [0.5998658443025972, 0.5973517275973438, 0.585667928042467], 'system_35': [0.605256876883596, 0.594195807770365, 0.5832715663278305], 'system_4': [0.44583808394556507, 0.430583499886202, 0.4218363240807499], 'system_5': [0.43407968735723795, 0.41927005366211384, 0.41355942986901983], 'system_6': [0.612445418797352, 0.6124627577417818, 0.6045937009964478], 'system_7': [0.6023165430093183, 0.5963609453901584, 0.592602458564544]}, 'rougeLsum': {'system_1': [0.601139403820665, 0.5956253985006844, 0.5938228872724967], 'system_2': [0.6060203420116423, 0.5977355080399951, 0.5928882570452056], 'system_31': [0.5990313049536441, 0.5909763561356894, 0.586571888412856], 'system_32': [0.5946899739426622, 0.5882008967598912, 0.5858670549778399], 'system_33': [0.5998687564193838, 0.5942162521164861, 0.5871159608213457], 'system_34': [0.6000057964186237, 0.5973730991453368, 0.5857445019482551], 'system_35': [0.6052210899816826, 0.5938430855095576, 0.583253505550163], 'system_4': [0.44598944296750276, 0.43074204936298344, 0.4216903455127595], 'system_5': [0.4341579225273386, 0.41921345783175495, 0.4132883392359126], 'system_6': [0.6124853201504509, 0.6124007135562005, 0.6047117884439586], 'system_7': [0.6023283087649791, 0.5963472538613532, 0.5927292213203716]}},
{'rouge1': {'system_1': [0.6246878094875561, 0.6341441822703994, 0.6274991372461587], 'system_2': [0.624626367838713, 0.6362783206601219, 0.6354809384039172], 'system_31': [0.6220648604882084, 0.6300350744143502, 0.6260482827509799], 'system_32': [0.6252454148480854, 0.6318133565880665, 0.627464216395424], 'system_33': [0.6227111426025169, 0.6346048027312454, 0.6259685606548056], 'system_34': [0.6214302948230429, 0.6308764223358232, 0.6264845821708296], 'system_35': [0.6254565678340289, 0.6345294181286941, 0.6264105558628021], 'system_4': [0.5062272076188992, 0.5020769021769059, 0.5118406101470804], 'system_5': [0.4826523491171023, 0.48991883382314916, 0.5023352266100427], 'system_6': [0.644827581196672, 0.643758405662465, 0.6486185437193592], 'system_7': [0.6313350172884785, 0.6369078181311033, 0.6374436402782385]}, 'rouge2': {'system_1': [0.3664416264089301, 0.3712405815927888, 0.3682803628229483], 'system_2': [0.3611705697191213, 0.3774227651927847, 0.37941467463046064], 'system_31': [0.3581175207476469, 0.3618645369852031, 0.36046831601997714], 'system_32': [0.3600245892816643, 0.3658657700226036, 0.3591370684643754], 'system_33': [0.3537560452839148, 0.3676212314907734, 0.3581650945407853], 'system_34': [0.34971118017803154, 0.3612727157933159, 0.3583893375903354], 'system_35': [0.3627048855561921, 0.3697656763063377, 0.3637476472950141], 'system_4': [0.2769482345027149, 0.2697279883199517, 0.28153097880173333], 'system_5': [0.25264042099494116, 0.25806847608677513, 0.26862913588462867], 'system_6': [0.38802328758706406, 0.38341349694632215, 0.39445953568171227], 'system_7': [0.3708638456521669, 0.376473632944887, 0.38077604663344755]}, 'rougeL': {'system_1': [0.5910999905976965, 0.5989011573295575, 0.5957896749554574], 'system_2': [0.5906174197016779, 0.6025233816049732, 0.6032043151593672], 'system_31': [0.5865725319145447, 0.59427766178537, 0.5932414428322146], 'system_32': [0.5916026838730599, 0.5962983728562639, 0.5941042440696599], 'system_33': [0.5882832614912541, 0.599503114972065, 0.594188119754569], 'system_34': [0.5861111362161271, 0.5957065143551333, 0.5935046589895423], 'system_35': [0.5916788017390664, 0.6001195363270913, 0.5950382123157152], 'system_4': [0.43149021728864995, 0.43233119309028584, 0.4432197646237495], 'system_5': [0.4062111560774062, 0.41770717791362777, 0.4315069418159273], 'system_6': [0.6119326186700795, 0.6097026243240721, 0.6154584578847417], 'system_7': [0.5897012480781769, 0.5966671920280665, 0.6001297772985535]}, 'rougeLsum': {'system_1': [0.5911310411599533, 0.5990124470917675, 0.5952783012718421], 'system_2': [0.5908888275750752, 0.6026718921073135, 0.6030122272047069], 'system_31': [0.5866883337207177, 0.5945823738605258, 0.5929320044397541], 'system_32': [0.5915945086533169, 0.5967656239933345, 0.5939167897285027], 'system_33': [0.5884882427899627, 0.5997469142669043, 0.5938326902971658], 'system_34': [0.5862423817633475, 0.5959579316141378, 0.5931155022716483], 'system_35': [0.5917674788969137, 0.6004841051836198, 0.5945351095312911], 'system_4': [0.43144287591633057, 0.4326217517063035, 0.44295494107662337], 'system_5': [0.4061211561376144, 0.41769475645614373, 0.4312710379484257], 'system_6': [0.61218696437029, 0.6100260807898317, 0.6152167329286842], 'system_7': [0.5898965921737854, 0.5970389933433461, 0.6000557629735366]}}]

In [43]:
d = {t: {name: [] for name, _ in operating_modes} for t in ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']}
for metr in l[0].keys():
    for key in l[0][metr].keys():
        for el in l:
            d[metr][key].extend(el[metr][key])

In [53]:
import pickle

with open("modes.pkl", "wb") as f:
    pickle.dump(modes, f)

In [51]:
modes

{'rouge1': {'system_1': [0.6287601003862089,
   0.633608795881407,
   0.6326245326247466,
   0.6359633208729055,
   0.6369103844306321,
   0.6328006678071453,
   0.6274612564190746,
   0.6246878094875561,
   0.6341441822703994,
   0.6274991372461587],
  'system_2': [0.6321762360161955,
   0.6325924906185946,
   0.6349583752497027,
   0.6370928735898586,
   0.6416113839275822,
   0.634377422798425,
   0.6263296707635586,
   0.624626367838713,
   0.6362783206601219,
   0.6354809384039172],
  'system_31': [0.6252684861050168,
   0.6287479980898582,
   0.6254257186251064,
   0.6359014185096944,
   0.6347884428292104,
   0.6279810883643742,
   0.6201299531010094,
   0.6220648604882084,
   0.6300350744143502,
   0.6260482827509799],
  'system_32': [0.6227325635158591,
   0.6284970078568923,
   0.6260437036629444,
   0.6331590632121954,
   0.6297464396281792,
   0.6234853578029658,
   0.6187048470040165,
   0.6252454148480854,
   0.6318133565880665,
   0.627464216395424],
  'system_33': [0.62

In [46]:
from scipy.stats import ttest_rel

# paired t test
for mode in modes:
    print(mode)
    for key in list(modes[mode].keys())[1:]:
        print("---", key, ttest_rel(modes[mode][key], modes[mode]["system_1"], alternative="greater"))

rouge1
--- system_2 TtestResult(statistic=2.4023551028155796, pvalue=0.019872060376728527, df=9)
--- system_31 TtestResult(statistic=-5.103890537158945, pvalue=0.9996791137069262, df=9)
--- system_32 TtestResult(statistic=-4.344860727156201, pvalue=0.9990679071210917, df=9)
--- system_33 TtestResult(statistic=-1.6842077921728864, pvalue=0.9367861668964658, df=9)
--- system_34 TtestResult(statistic=-3.439162860367714, pvalue=0.9962998524421257, df=9)
--- system_35 TtestResult(statistic=-0.9726788143056638, pvalue=0.8219360161829277, df=9)
--- system_4 TtestResult(statistic=-49.83283623928579, pvalue=0.9999999999986764, df=9)
--- system_5 TtestResult(statistic=-70.41479567367517, pvalue=0.9999999999999406, df=9)
--- system_6 TtestResult(statistic=8.875173315003128, pvalue=4.785049516239032e-06, df=9)
--- system_7 TtestResult(statistic=6.560139777459468, pvalue=5.197263473673728e-05, df=9)
rouge2
--- system_2 TtestResult(statistic=2.2994932786975153, pvalue=0.023519187002648313, df=9)
---