In [1]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset, DatasetDict, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Trainer,
)
import pandas as pd
import torch
import evaluate
import os

BATCH_SIZE = 16
NUM_EPOCHS = 7
N_GEN = 50
base_checkpoint = "t5-small"

tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ds = load_dataset("ColumbiaNLP/FLUTE").shuffle(seed=42)
df = pd.read_csv("complete_dataset.csv").fillna("")
ds = Dataset.from_pandas(df).shuffle(seed=42)
folds = StratifiedKFold(n_splits=10, shuffle=False)
splits = folds.split(ds, ds["label"])
indexes = [t for t in splits]

In [2]:
from flute_dream import add_combined_cols

ds = ds.map(add_combined_cols)

Map:   0%|          | 0/7534 [00:00<?, ? examples/s]

In [3]:
def preprocess_dataset_s1(examples):
    model_inputs = tokenizer(examples["premise_hypothesis"])
    labels = tokenizer(examples["label_explanation"])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_dataset_s2(examples):
    model_inputs = tokenizer(examples["premise_hypothesis_system_2"])
    labels = tokenizer(examples["type_label_explanation"])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_dataset_s31(examples):
    model_inputs = tokenizer(examples["premise_hypothesis_emotion"])
    labels = tokenizer(examples["label_explanation"])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_dataset_s32(examples):
    model_inputs = tokenizer(examples["premise_hypothesis_motivation"])
    labels = tokenizer(examples["label_explanation"])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_dataset_s33(examples):
    model_inputs = tokenizer(examples["premise_hypothesis_consequence"])
    labels = tokenizer(examples["label_explanation"])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_dataset_s34(examples):
    model_inputs = tokenizer(examples["premise_hypothesis_rot"])
    labels = tokenizer(examples["label_explanation"])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_dataset_s35(examples):
    model_inputs = tokenizer(examples["premise_hypothesis_all_dims"])
    labels = tokenizer(examples["label_explanation"])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_dataset_s41(examples):
    model_inputs = tokenizer(examples["premise_hypothesis"])
    labels = tokenizer(examples["label"])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_dataset_s42(examples):
    model_inputs = tokenizer(examples["premise_hypothesis_label"])
    labels = tokenizer(examples["explanation"])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def preprocess_dataset_s7(examples):
    lbl_exp = [
        ex[ex.find("Explanation: ") :] + ex[: ex.find("Explanation: ")]
        for ex in examples["label_explanation"]
    ]
    model_inputs = tokenizer(examples["premise_hypothesis"])
    labels = tokenizer(lbl_exp)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def get_path(name):
    return f"{name}/{os.listdir(name)[0]}"


def get_prem_hyp(s):
    ind = s.find("Hypothesis: ")
    prem = s[len("Premise: ") : ind]
    hyp = s[ind + len("Hypothesis: ") :]
    return [prem, hyp]


"""Class encapsulating the two steps of System 4 (Classify, then Explain)"""


class DREAM_FLUTE_System4:
    def __init__(
        self, tokenizer=None, model_s41_path=None, model_s42_path=None
    ) -> None:
        self.tokenizer = (
            tokenizer
            if tokenizer is not None
            else AutoTokenizer.from_pretrained("t5-small")
        )
        self.model_s41 = (
            AutoModelForSeq2SeqLM.from_pretrained(model_s41_path)
            if model_s41_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained(
                "YoanBOUTE/DREAM-FLUTE-S4-Classify"
            )
        )
        self.model_s42 = (
            AutoModelForSeq2SeqLM.from_pretrained(model_s42_path)
            if model_s42_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained(
                "YoanBOUTE/DREAM-FLUTE-S4-Explain"
            )
        )

    """Expected input for function : "Premise: ... . Hypothesis: ... . Is there a contradiction or entailment between the premise and hypothesis ?" 
    Or list of strings in this format"""

    def predict(self, inputs):
        conc_inp = [in1 + in2 for in1, in2 in inputs]
        self.model_s41 = self.model_s41.to(device)
        self.model_s42 = self.model_s42.to(device)
        tok_input = self.tokenizer(conc_inp, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
        with torch.no_grad():
            output_model_1 = self.model_s41.generate(tok_input, max_new_tokens=100)
        labels = ["Label: " + el for el in self.tokenizer.batch_decode(output_model_1, skip_special_tokens=True)]
        s1 = "Is there a contradiction or entailment between the premise and hypothesis ?"
        s2 = ". What is the explanation of the label associated to the premise and the hypothesis ?"
        intermediate_input = [inp[:inp.find(s1)] + lbl + s2 for inp, lbl in zip(conc_inp, labels)]
        tok_intermediate_input = self.tokenizer(intermediate_input, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
        with torch.no_grad():
            output_model_2 = self.model_s42.generate(tok_intermediate_input, max_new_tokens=100)
        explanations = self.tokenizer.batch_decode(output_model_2, skip_special_tokens=True)
        return [lbl + ". Explanation: " + expl for lbl, expl in zip(labels, explanations)]
    

"""Ensemble class that loads all models from HuggingFace (or from the device if a path to the model is indicated) 
and implements the ensembling algorithm given in the DREAM-FLUTE paper"""


class DREAM_FLUTE_Ensemble:
    def __init__(
        self,
        tokenizer_path=None,
        s1_path=None,
        s2_path=None,
        s3_emo_path=None,
        s3_mot_path=None,
        s3_cons_path=None,
        s3_rot_path=None,
        s3_alldims_path=None,
        s4_clas_path=None,
        s4_exp_path=None,
        dream_path=None,
    ) -> None:
        self.tokenizer = (
            AutoTokenizer.from_pretrained(tokenizer_path)
            if tokenizer_path is not None
            else AutoTokenizer.from_pretrained("t5-small")
        )
        self.model_s1 = (
            AutoModelForSeq2SeqLM.from_pretrained(s1_path)
            if s1_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained("YoanBOUTE/DREAM-FLUTE-S1")
        )
        self.model_s2 = (
            AutoModelForSeq2SeqLM.from_pretrained(s2_path)
            if s2_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained("YoanBOUTE/DREAM-FLUTE-S2")
        )
        self.model_s3_emo = (
            AutoModelForSeq2SeqLM.from_pretrained(s3_emo_path)
            if s3_emo_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained(
                "YoanBOUTE/DREAM-FLUTE-S3-Emotion"
            )
        )
        self.model_s3_mot = (
            AutoModelForSeq2SeqLM.from_pretrained(s3_mot_path)
            if s3_mot_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained(
                "YoanBOUTE/DREAM-FLUTE-S3-Motivation"
            )
        )
        self.model_s3_cons = (
            AutoModelForSeq2SeqLM.from_pretrained(s3_cons_path)
            if s3_cons_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained(
                "YoanBOUTE/DREAM-FLUTE-S3-Consequence"
            )
        )
        self.model_s3_rot = (
            AutoModelForSeq2SeqLM.from_pretrained(s3_rot_path)
            if s3_rot_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained("YoanBOUTE/DREAM-FLUTE-S3-ROT")
        )
        self.model_s3_alldims = (
            AutoModelForSeq2SeqLM.from_pretrained(s3_alldims_path)
            if s3_alldims_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained(
                "YoanBOUTE/DREAM-FLUTE-S3-AllDims"
            )
        )
        self.model_s4 = DREAM_FLUTE_System4(self.tokenizer, s4_clas_path, s4_exp_path)
        self.model_dream = (
            AutoModelForSeq2SeqLM.from_pretrained(dream_path)
            if dream_path is not None
            else AutoModelForSeq2SeqLM.from_pretrained("RicoBorra/DREAM-t5-small")
        )

    """Tokenizes the input, then feeds it to the given model, and decodes the output to have a string as result.
    This method is callable for all models except System 4 (Use the method defined in the class of System 4)"""

    def _prediction_pipeline(self, inputs: str, model) -> str:
        tokenized_input = self.tokenizer(inputs, return_tensors="pt", truncation=True, padding=True).input_ids.to(device)
        model = model.to(device)
        model_output = model.generate(tokenized_input, max_new_tokens=100)
        decoded_output = self.tokenizer.batch_decode(model_output, skip_special_tokens=True)
        return decoded_output

    def _get_batched_predictions(self, inputs: list):
        prems, hyps = zip(*inputs)
        prems = [prem.strip() for prem in prems]
        hyps = [hyp.strip() for hyp in hyps]
        prems = [prem + "." if not prem.endswith(".") else prem for prem in prems]
        hyps = [hyp + "." if not hyp.endswith(".") else hyp for hyp in hyps]

        predictions = dict()

        input_1 = [f"Premise: {prem} Hypothesis: {hyp} Is there a contradiction or entailment between the premise and hypothesis ?" for prem, hyp in zip(prems, hyps)]
        predictions["S1"] = self._prediction_pipeline(input_1, self.model_s1)

        input_2 = [f"Premise: {prem} Hypothesis: {hyp} What is the type of figurative language involved? Is there a contradiction or entailment between the premise and hypothesis ?" for prem, hyp in zip(prems, hyps)]
        predictions["S2"] = self._prediction_pipeline(input_2, self.model_s2)

        # DREAM elaborations for system 3
        input_dream_prems = [f"[SITUATION] {prem} [QUERY] " for prem in prems]
        input_dream_hyps = [f"[SITUATION] {hyp} [QUERY] " for hyp in hyps]
        prem_elaborations = {key: self._prediction_pipeline([idp + key for idp in input_dream_prems], self.model_dream) for key in ["emotion", "motivation", "consequence", "rot"]}
        for key, elabs in prem_elaborations.items():
            elabs = [elab.strip() for elab in elabs]
            elabs = [elab + "." if not elab.endswith(".") else elab for elab in elabs]
            prem_elaborations[key] = elabs
        hyp_elaborations = {key: self._prediction_pipeline([idh + key for idh in input_dream_hyps], self.model_dream) for key in ["emotion", "motivation", "consequence", "rot"]}
        for key, elabs in hyp_elaborations.items():
            elabs = [elab.strip() for elab in elabs]
            elabs = [elab + "." if not elab.endswith(".") else elab for elab in elabs]
            hyp_elaborations[key] = elabs

        input_3_emo = []
        for i in range(len(inputs)):
            input_3_emo.append(f"Premise: {prems[i]} [Emotion] {prem_elaborations['emotion'][i]} Hypothesis: {hyps[i]} [Emotion] {hyp_elaborations['emotion'][i]} Is there a contradiction or entailment between the premise and hypothesis ?")
        predictions["S3-emo"] = self._prediction_pipeline(input_3_emo, self.model_s3_emo)

        input_3_mot = []
        for i in range(len(inputs)):
            input_3_mot.append(f"Premise: {prems[i]} [Motivation] {prem_elaborations['motivation'][i]} Hypothesis: {hyps[i]} [Motivation] {hyp_elaborations['motivation'][i]} Is there a contradiction or entailment between the premise and hypothesis ?")
        predictions["S3-mot"] = self._prediction_pipeline(input_3_mot, self.model_s3_mot)

        input_3_cons = []
        for i in range(len(inputs)):
            input_3_cons.append(f"Premise: {prems[i]} [Consequence] {prem_elaborations['consequence'][i]} Hypothesis: {hyps[i]} [Consequence] {hyp_elaborations['consequence'][i]} Is there a contradiction or entailment between the premise and hypothesis ?")
        predictions["S3-cons"] = self._prediction_pipeline(input_3_cons, self.model_s3_cons)

        input_3_rot = []
        for i in range(len(inputs)):
            input_3_rot.append(f"Premise: {prems[i]} [Rot] {prem_elaborations['rot'][i]} Hypothesis: {hyps[i]} [Rot] {hyp_elaborations['rot'][i]} Is there a contradiction or entailment between the premise and hypothesis ?")
        predictions["S3-rot"] = self._prediction_pipeline(input_3_rot, self.model_s3_rot)

        input_3_all = []
        for prem, hyp in zip(prems, hyps):
            el = f"Premise: {prem} "
            for key, elab in prem_elaborations.items():
                el += f"[{key.capitalize()}] {elab} "
            el += f"Hypothesis: {hyp} "
            for key, elab in hyp_elaborations.items():
                el += f"[{key.capitalize()}] {elab} "
            el += "Is there a contradiction or entailment between the premise and hypothesis ?"
            input_3_all.append(el)
        predictions["S3-all"] = self._prediction_pipeline(input_3_all, self.model_s3_alldims)

        # The input for system 4 is in the same format as for system 1
        predictions["S4"] = self.model_s4.predict([[in1, ""] for in1 in input_1])

        return predictions

    """Uses the predictions from each model to compute the final prediction of the ensemble"""

    def _ensemble_algorithm(self, model_outputs):
        # Firstly, the label is selected based on the majority between the 5 best models (according to the paper : systems 1, 2, 3-motivation, 3-alldims, 4)
        labels = [
            model_outputs[key].split(".")[0]
            for key in ["S1", "S2", "S3-mot", "S3-all", "S4"]
        ]
        # Sometimes, it might happen with the small models that the generated label is a mix of words, like 'Contratailment' or 'Endiction'
        for label in labels:
            if label not in ["Label: Contradiction", "Label: Entailment"]:
                labels.remove(label)
        unique, counts = np.unique(labels, return_counts=True)
        ix = np.argmax(counts)
        major_label = unique[ix]

        # Then, pick the explanation of the first system agreeing with the majority label, following an order indicated in the paper
        for key in ["S3-cons", "S3-emo", "S2", "S3-all", "S3-mot", "S4", "S1"]:
            substrings = model_outputs[key].split(".")
            label = substrings[0]
            explanation = substrings[1]

            if label == major_label:
                break

        return major_label + "." + explanation + "."

    """Expected input : [Premise_sentence, hypothesis_sentence] or list of inputs"""

    def predict(self, inputs):
        preds_dict = self._get_batched_predictions(inputs)
        preds = [{k: preds_dict[k][i] for k in preds_dict} for i in range(len(inputs))]
        final_pred = [self._ensemble_algorithm(pred) for pred in preds]

        return final_pred

In [4]:
operating_modes = [
    ("system_1", preprocess_dataset_s1),
    ("system_2", preprocess_dataset_s2),
    ("system_31", preprocess_dataset_s31),
    ("system_32", preprocess_dataset_s32),
    ("system_33", preprocess_dataset_s33),
    ("system_34", preprocess_dataset_s34),
    ("system_35", preprocess_dataset_s35),
    ("system_4", preprocess_dataset_s1),
    ("system_5", preprocess_dataset_s1),
    ("system_6", preprocess_dataset_s1),
    ("system_7", preprocess_dataset_s7),
]

In [5]:
def train_model(name, model, curr_ds, num_epochs=NUM_EPOCHS):
    training_args = Seq2SeqTrainingArguments(
        output_dir=f"{name}",
        learning_rate=3e-4,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=2 * BATCH_SIZE,
        save_total_limit=1,
        num_train_epochs=num_epochs,
        report_to="none",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        eval_accumulation_steps=1,
        logging_steps=1,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=curr_ds["train"],
        eval_dataset=curr_ds["val"].select(range(350)),
        tokenizer=tokenizer,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    )

    trainer.train()

    return trainer

In [6]:
records = []

## Train + Evaluate

In [7]:
from IPython.display import clear_output

modes = {t: {name: [] for name, _ in operating_modes} for t in ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']}

for train_idxs, val_idxs in indexes[4:7]:
    fold_dataset = DatasetDict(
        {"train": ds.select(train_idxs), "val": ds.select(val_idxs)}
    )

    for name, preprocess_func in operating_modes:
        curr_ds = fold_dataset.map(preprocess_func, batched=True).remove_columns(fold_dataset["train"].column_names)

        if name == "system_4":
            ds_41 = fold_dataset.map(preprocess_dataset_s41, batched=True).remove_columns(fold_dataset["train"].column_names)
            train_model("system_41", AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint), ds_41, 2)
            ds_42 = fold_dataset.map(preprocess_dataset_s42, batched=True).remove_columns(fold_dataset["train"].column_names)
            train_model("system_42", AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint), ds_42, 8)
        elif name == "system_5":
            pass
        else:
            if name == "system_6":
                model = AutoModelForSeq2SeqLM.from_pretrained("RicoBorra/T5-small-synthetic-FLUTE")
            else:
                model = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint)
            trainer = train_model(name, model, curr_ds)

        # have to do batched rouge computation otherwise not enough memory
        rouge = evaluate.load("rouge")
        metrics = {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0, "rougeLsum": 0.0}
        count = 0
        for i in range(0, len(curr_ds["val"]), N_GEN):
            count += 1
            if (name != "system_4") and (name != "system_5"):
                (predictions, _), label_ids, _ = trainer.predict(test_dataset=curr_ds["val"].select(range(i, min(i + N_GEN, len(curr_ds["val"])))))
                # delete stuff after EOS token
                predicted_token_ids = torch.argmax(torch.from_numpy(predictions), dim=-1)
                for j in range(predicted_token_ids.shape[0]):
                    ind = (predicted_token_ids[j] == 1).nonzero(as_tuple=True)[0]
                    if ind.numel() != 0:
                        predicted_token_ids[j, ind[0] :] = 1
                decoded_preds = tokenizer.batch_decode(predicted_token_ids, skip_special_tokens=True)
                # clean decoded preds if needed
                if name == "system_2":
                    decoded_preds = [dp[dp.find("Label") :] for dp in decoded_preds]
                if name == "system_7":
                    decoded_preds = [ex[ex.find("Label: ") :] + " " + ex[: ex.find("Label: ")] for ex in decoded_preds]
            else:
                small_ds = curr_ds["val"].select(range(i, min(i + N_GEN, len(curr_ds["val"]))))
                label_ids = small_ds["labels"]
                max_len = max([len(el) for el in label_ids])
                label_ids = [el + (max_len - len(el)) * [0] for el in label_ids]
                inputs = tokenizer.batch_decode(small_ds["input_ids"], skip_special_tokens=True)
                if name == "system_4":
                    sys_4 = DREAM_FLUTE_System4(
                        tokenizer=None,
                        model_s41_path=get_path("system_41"),
                        model_s42_path=get_path("system_42"),
                    )
                    decoded_preds = sys_4.predict(inputs=[[in1, ""] for in1 in inputs])
                else:  # "name == system_5"
                    inputs = [get_prem_hyp(ex) for ex in inputs]
                    sys_5 = DREAM_FLUTE_Ensemble(
                        tokenizer_path=None,
                        s1_path=get_path("system_1"),
                        s2_path=get_path("system_2"),
                        s3_emo_path=get_path("system_31"),
                        s3_mot_path=get_path("system_32"),
                        s3_cons_path=get_path("system_33"),
                        s3_rot_path=get_path("system_34"),
                        s3_alldims_path=get_path("system_35"),
                        s4_clas_path=get_path("system_41"),
                        s4_exp_path=get_path("system_42"),
                        dream_path=None,
                    )
                    decoded_preds = sys_5.predict(inputs=inputs)

            # careful here
            labels = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            if name == "system_2":
                decoded_labels = [dp[dp.find("Label") :] for dp in decoded_labels]
            if name == "system_7":
                decoded_labels = [ex[ex.find("Label: ") :] + " " + ex[: ex.find("Label: ")] for ex in decoded_labels]
            new_metrics = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            for k in new_metrics:
                metrics[k] += new_metrics[k]

        if name == "system_4":
            del sys_4
        if name == "system_5":
            del sys_5
        if (name != "system_4") and (name != "system_5"):
            del trainer

        for k in metrics:
            metrics[k] /= count

        clear_output(wait=True)
        for k in metrics:
            modes[k][name].append(metrics[k])
        print(modes)
        # print(name, decoded_preds[0], decoded_labels[0])
        records.append({"name": name, "content": decoded_preds[0]})
        # print(predicted_token_ids[0], labels[0])

{'rouge1': {'system_1': [0.6541110604616821], 'system_2': [0.6492539377643075], 'system_31': [0.6547604933356668], 'system_32': [0.6501899650186255], 'system_33': [0.654412319742571], 'system_34': [0.6506737743649341], 'system_35': [0.6537319182836477], 'system_4': [0.524399108250037], 'system_5': [0.5126419818402839], 'system_6': [], 'system_7': []}, 'rouge2': {'system_1': [0.3971126893592013], 'system_2': [0.397006994099452], 'system_31': [0.4003323290815623], 'system_32': [0.39337242149643925], 'system_33': [0.3968850215470675], 'system_34': [0.3926166415695221], 'system_35': [0.3980990830439629], 'system_4': [0.29691910094640384], 'system_5': [0.28259423291941516], 'system_6': [], 'system_7': []}, 'rougeL': {'system_1': [0.6201226587556153], 'system_2': [0.6190267700382599], 'system_31': [0.6208742615001395], 'system_32': [0.6181346759453795], 'system_33': [0.6207930143162377], 'system_34': [0.618096674532199], 'system_35': [0.6206273990807694], 'system_4': [0.45205711415654476], '

Map:   0%|          | 0/6781 [00:00<?, ? examples/s]

Map:   0%|          | 0/753 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,1.4725,1.449973
2,1.4551,1.386363
3,1.4147,1.350236
4,1.2307,1.328239
5,1.2394,1.313765
6,1.0828,1.311675


## Examples

In [None]:
records.append({"name": "ground_truth", "content": decoded_labels[0]})
records[-11:]

[{'name': 'system_4',
  'content': 'Label: Entailment. Explanation: Slow drivers are often used to make people late to work and so getting behind them is not a contradiction or entailment between the two.'},
 {'name': 'ground_truth',
  'content': 'Label: Entailment. Explanation: Getting behind a slow driver often causes people to feel frustrated because it makes them late for their destination'}]

In [None]:
"""
{'system_1': [0.5056115530372459, 0.5085812472965738], 'system_2': [0.5323239453833644, 0.5374525863206762], 'system_31': [0.5084813364883226, 0.5078069259932709], 'system_32': [0.5055255195638098, 0.5074808302929493], 'system_33': [0.5032615710245999, 0.508591734512484], 'system_34': [0.5067396302748032, 0.5087587260751042], 'system_35': [0.5036829298423671, 0.5073656197676405]}
{'system_1': [0.47329270980754923], 'system_2': [0.5039333868999851], 'system_31': [0.47420432731446144], 'system_32': [0.473849787491779], 'system_33': [0.47287909150934293], 'system_34': [0.47171914165316353], 'system_35': [0.47436950304876985]}
{'system_1': [0.5221458462682274, 0.4906728974226775], 'system_2': [0.5453307495230577, 0.5189864534869755], 'system_31': [0.524029905213074, 0.49432447432649923], 'system_32': [0.5236538569098267, 0.4949640794043393], 'system_33': [0.522696825432352, 0.49418902866839187], 'system_34': [0.5231002650281118, 0.49245360907178504], 'system_35': [0.5238553839735753, 0.4967562420314123]}
{'system_1': [0.5201383622010409, 0.5019083214066056], 'system_2': [0.5451744732719431, 0.5273190341222926], 'system_31': [0.5192974336052382, 0.5002995114825706], 'system_32': [0.5188841013023715, 0.5007719063422055], 'system_33': [0.5203219622545853, 0.49992736417041994], 'system_34': [0.5180675680247785, 0.5008015168679895], 'system_35': [0.5224425242712794, 0.5010754435268072]}
{'system_1': [0.49590791112913984], 'system_2': [0.5147096144113823], 'system_31': [0.4927430972029428], 'system_32': [0.49258799343269694], 'system_33': [0.49049952341283654], 'system_34': [0.49317378483047924], 'system_35': [0.4919880906784929]}
{'system_1': [0.498351050356349, 0.5090844074007058], 'system_2': [0.5289706096196247, 0.535423121872322], 'system_31': [0.5004418699772601, 0.5081924349139884], 'system_32': [0.5000023839289528, 0.5069172466751917], 'system_33': [0.49609349538092806, 0.5060478473679386], 'system_34': [0.49683869684920356, 0.5072891134960673], 'system_35': [0.49873229852851614, 0.5063988604083957]}
"""

"\n{'system_1': [0.5056115530372459, 0.5085812472965738], 'system_2': [0.5323239453833644, 0.5374525863206762], 'system_31': [0.5084813364883226, 0.5078069259932709], 'system_32': [0.5055255195638098, 0.5074808302929493], 'system_33': [0.5032615710245999, 0.508591734512484], 'system_34': [0.5067396302748032, 0.5087587260751042], 'system_35': [0.5036829298423671, 0.5073656197676405]}\n{'system_1': [0.47329270980754923], 'system_2': [0.5039333868999851], 'system_31': [0.47420432731446144], 'system_32': [0.473849787491779], 'system_33': [0.47287909150934293], 'system_34': [0.47171914165316353], 'system_35': [0.47436950304876985]}\n{'system_1': [0.5221458462682274, 0.4906728974226775], 'system_2': [0.5453307495230577, 0.5189864534869755], 'system_31': [0.524029905213074, 0.49432447432649923], 'system_32': [0.5236538569098267, 0.4949640794043393], 'system_33': [0.522696825432352, 0.49418902866839187], 'system_34': [0.5231002650281118, 0.49245360907178504], 'system_35': [0.5238553839735753, 

In [None]:
inputs[0]

['This operation is obscured and not transparent. ',
 'This operation is as transparent as tinted glass. Is there a contradiction or entailment between the premise and hypothesis?']

In [None]:
modes

{'system_1': [0.6544642734433673, 0.6538472479091643, 0.6451995135621926],
 'system_2': [0.6591784410473625, 0.6556348348860563, 0.6485904916673478],
 'system_31': [0.6616151178011304, 0.6529529681168658, 0.6457822724848103],
 'system_32': [0.6602817794390899, 0.6454009215237158, 0.6454316035156613],
 'system_33': [0.6577626817560993, 0.654545940696381, 0.6495673748102144],
 'system_34': [0.6577570152042305, 0.6535457188788231, 0.6434608160222891],
 'system_35': [0.6555744419703384, 0.6538046517382146, 0.6470891971959015],
 'system_4': [0.52284036210826, 0.5163325460813661, 0.5156399803184772],
 'system_5': [0.5240891672641392, 0.5080151274468501, 0.5057846437218605],
 'system_6': [0.6635674632696552, 0.6627904866947751, 0.6585561479873518],
 'system_7': [0.6611524853455227, 0.6590132359931218, 0.6561421023328687]}

In [None]:
from scipy.stats import ttest_rel

# paired t test
for mode in modes:
    print(mode)
    for key in list(modes[mode].keys())[1:]:
        print("---", key, ttest_rel(modes[mode][key], modes[mode]["system_1"], alternative="greater"))

system_2 TtestResult(statistic=23.19102411438574, pvalue=1.2259514290288666e-09, df=9)
system_31 TtestResult(statistic=0.5948201499859518, pvalue=0.2833078171850142, df=9)
system_32 TtestResult(statistic=-0.15212858962856382, pvalue=0.5587793758146149, df=9)
system_33 TtestResult(statistic=-1.4525349538900345, pvalue=0.9098456440410233, df=9)
system_34 TtestResult(statistic=-1.3719083987600567, pvalue=0.898341997218993, df=9)
system_35 TtestResult(statistic=0.10660074133722681, pvalue=0.45872204141041606, df=9)
