## SETUP AND DATASET LOADING

In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

In [None]:
import pandas as pd

In [None]:
path_prefix = "/content/gdrive/MyDrive/DATASETS/NLI/"

In [None]:
cd /content/gdrive/MyDrive/DATASETS/NLI/

In [None]:
ls

In [None]:
# Dataset folder names. The position in this list is the `dataset_index`
# accepted by import_dataset / test_on_datasets / time_tests; index 0
# ("scitail") is the one those functions treat as the binary task.
NLI_DATASETS = ["scitail", "anli", "glue-mnli", "sick", "superglue-cb"]

superglue cb
sick
scitail
anli
glue-mnli


In [None]:
NLI_DATASETS

In [None]:
import pandas as pd
import re

In [None]:
# Presumably the seeds of the available dataset splits.
# NOTE(review): not referenced by any code visible in this notebook — confirm it is still needed.
DATASET_SEEDS = [13, 21, 42, 87, 100]
# Field markers that preprocess_nli strips from every raw query string.
WORDS_TO_REMOVE = ["context:", "premise:", "sentence 1:", "sentence 2:", "sentence:", "hypothesis:"]

In [None]:
def import_dataset(dataset_index, test_set = False, seed = 100, data_dir = ""):
  """Load one split of an NLI dataset and split every query into (premise, hypothesis).

  Args:
    dataset_index: position of the dataset in NLI_DATASETS.
    test_set: when truthy the `test` split is loaded, otherwise the `dev` split.
    seed: seed that is part of the file name. Defaults to 100, the value that
      was previously hard-coded.
    data_dir: directory that contains the dataset folders. The default ""
      resolves the path relative to the current working directory, exactly as
      before.

  Returns:
    A tuple (queries, labels): column 0 of the TSV after preprocess_nli has
    been applied to each row, and column 1 untouched.
  """
  import os

  dataset_name = NLI_DATASETS[dataset_index]
  print(dataset_name)

  split = "test" if test_set else "dev"
  file_path = os.path.join(data_dir, dataset_name, f"{dataset_name}_16_{seed}_{split}.tsv")

  # NOTE(review): read_csv uses its default quoting here; if the TSV files can
  # contain unbalanced double quotes, rows may be merged — confirm.
  df = pd.read_csv(file_path, sep="\t", header=None)

  df[0] = df[0].apply(preprocess_nli)
  return (df[0], df[1])

In [None]:
def preprocess_nli(query, words_to_remove = None):
  """Strip the field markers from a raw query and split it into (premise, hypothesis).

  Args:
    query: raw string holding premise and hypothesis separated by "[SEP]".
    words_to_remove: substrings deleted from the query before splitting.
      Defaults to the module-level WORDS_TO_REMOVE.

  Returns:
    A tuple (premise, hypothesis): the text before the first "[SEP]" and the
    text between the first and the second "[SEP]" (or the end of the string).
    Surrounding whitespace is left untouched.

  Raises:
    ValueError: if the query contains no "[SEP]" separator.
  """
  if words_to_remove is None:
    words_to_remove = WORDS_TO_REMOVE

  for marker in words_to_remove:
    query = query.replace(marker, "")

  # A plain string split is enough for a literal separator and avoids the
  # invalid "\[" / "\]" escapes the non-raw regex pattern relied on.
  parts = query.split("[SEP]")
  if len(parts) < 2:
    raise ValueError(f"expected a '[SEP]' separator between premise and hypothesis, got: {query!r}")
  return (parts[0], parts[1])

In [None]:
# Smoke test: load and preprocess the dev split of every dataset once.
for dataset_index in range(len(NLI_DATASETS)):
  import_dataset(dataset_index)

In [None]:
# Peek at the raw ANLI test split. The file name follows the
# "<dataset>/<dataset>_16_<seed>_<split>.tsv" pattern used by import_dataset
# (the previous "an_16_100_test.tsv" did not match that pattern).
df = pd.read_csv("anli/anli_16_100_test.tsv", sep="\t", header = None)

In [None]:
df.shape

## PROMPTS


The scitail dataset is set up for binary NLI: the possible answers are entailment and not entailment.

In [None]:
# Prompt templates for the binary (two-option) NLI task.
# adapt_prompt fills {CONTEXT} with the premise, {HYPOTHESIS} with the
# hypothesis and {OPTIONS} with the list of allowed answers.
# Fixed relative to the previous version: a stray "\ " inside prompt 2 (which
# put a literal backslash into the prompt), "entails" -> "entail" in prompt 0
# and a stray space before {OPTIONS} in prompt 7, so the texts now agree with
# their counterparts in NLI_PROMPT_CONTEXT_HYPO.
NLI_TRUE_FALSE_PROMPTS = [
  "Premise: {CONTEXT}\n\nHypothesis: {HYPOTHESIS}\n\nDoes the premise entail the hypothesis?\n\n{OPTIONS}",
  "Premise: {CONTEXT}\nHypothesis: {HYPOTHESIS}\nIs the hypothesis entailed by the premise?\n{OPTIONS}",
  "Here is a premise:\n{CONTEXT}\n\nHere is a hypothesis:\n{HYPOTHESIS}\n\nIs it possible to conclude that if the premise is true, then so is the hypothesis?\n{OPTIONS}",
  "Sentence 1: {CONTEXT}\n\nSentence 2: {HYPOTHESIS}\nIs this second sentence entailed by the first sentence?\n\n{OPTIONS}",
  "Sentence 1: {CONTEXT}\n\nSentence 2: {HYPOTHESIS}\n\nIf the first sentence is true, then is the second sentence true?\n{OPTIONS}",
  'Based on the premise "{CONTEXT}", can we conclude the hypothesis "{HYPOTHESIS}" is true?\n\n{OPTIONS}',
  'Premise: "{CONTEXT}" If this premise is true, what does that tell us about whether it entails the hypothesis "{HYPOTHESIS}"?\n\n{OPTIONS}',
  'Premise:\n"{CONTEXT}" Based on this premise, is the hypothesis "{HYPOTHESIS}" true?\n{OPTIONS}',
  'If {CONTEXT}, can we conclude that "{HYPOTHESIS}"?\n{OPTIONS}',
  '{CONTEXT}\n\nDoes it follow that "{HYPOTHESIS}"?\n{OPTIONS}'
]

In [None]:
# Prompt templates that present the whole example as a single {SENTENCE}.
# {OPTIONS} is replaced by the list of allowed labels; prompt 1 spells the
# labels out itself and therefore has no {OPTIONS} placeholder.
# Fixed relative to the previous version: "follwing" -> "following" in prompt 1.
NLI_PROMPTS_SINGLE_SENTENCE = [
    "Choose the correct label among: {OPTIONS} for the following Natural Language Inference task: {SENTENCE}",
    "Read the following sentence. sentence: {SENTENCE}. Is it an 'entailment', a 'contradiction' or is it 'neutral'?",
    "Assign one of the following labels: {OPTIONS} to the following sentence S: {SENTENCE}" ,
    "The following assertion: '{SENTENCE}' is {OPTIONS}?",
    "A sentence can be of one type among: {OPTIONS}. Which type of sentence is [{SENTENCE}]?",
]

In [None]:
# Prompt templates that present premise ({CONTEXT}) and hypothesis
# ({HYPOTHESIS}) separately; {OPTIONS} is replaced by the allowed labels.
#
# Fixes relative to the previous version:
#   * a missing comma made Python concatenate "Can we draw the following
#     conclusion?" and "Does this next sentence follow ..." into one single
#     prompt containing every placeholder twice. They are two prompts again.
#     The second one is appended at the END of the list on purpose, so that
#     every other prompt keeps the index it had before and the indices stored
#     in best_prompts still point to the same templates.
#   * the opening quote before {HYPOTHESIS} was missing in the
#     'Based on the premise ...' prompt.
#   * typos in prompt 0 ("one of answer among theese three").
NLI_PROMPT_CONTEXT_HYPO = [
    "{CONTEXT}\nBased on the paragraph above can we conclude that the hypothesis: \"{HYPOTHESIS}\" is more likely to be a kind of {OPTIONS}? Choose only one answer among these three.",
    "{CONTEXT}\n\nBased on that paragraph can we conclude that this sentence is true?\n{HYPOTHESIS}\n\n{OPTIONS}",
    "{CONTEXT}\n\nCan we draw the following conclusion?\n{HYPOTHESIS}\n\n{OPTIONS}",
    "{CONTEXT}\nCan we infer the following?\n{HYPOTHESIS}\n\n{OPTIONS}",
    "Read the following paragraph and determine if the hypothesis is true:\n\n{CONTEXT}\n\nHypothesis: {HYPOTHESIS}\n\n{OPTIONS}",
    "Read the text and determine if the sentence is true:\n\n{CONTEXT}\n\nSentence: {HYPOTHESIS}\n\n{OPTIONS}",
    "Can we draw the following hypothesis from the CONTEXT? \n\nCONTEXT:\n\n{CONTEXT}\n\nHypothesis: {HYPOTHESIS}\n\n{OPTIONS}",
    "Determine if the sentence is true based on the text below:\n{HYPOTHESIS}\n\n{CONTEXT}\n{OPTIONS}",
    "Premise: {CONTEXT}\n\nHypothesis: {HYPOTHESIS}\n\nDoes the premise entail the hypothesis?\n\n{OPTIONS}",
    "Premise: {CONTEXT}\nHypothesis: {HYPOTHESIS}\nIs the hypothesis entailed by the premise?\n{OPTIONS}",
    "Here is a premise:\n{CONTEXT}\n\nHere is a hypothesis:\n{HYPOTHESIS}\n\nIs it possible to conclude that if the premise is true, then so is the hypothesis?\n{OPTIONS}",
    "Sentence 1: {CONTEXT}\n\nSentence 2: {HYPOTHESIS}\nIs this second sentence entailed by the first sentence?\n\n{OPTIONS}",
    "Sentence 1: {CONTEXT}\n\nSentence 2: {HYPOTHESIS}\n\nIf the first sentence is true, then is the second sentence true?\n{OPTIONS}",
    'Based on the premise "{CONTEXT}", can we conclude the hypothesis "{HYPOTHESIS}" is true?\n\n{OPTIONS}',
    'Premise: "{CONTEXT}" If this premise is true, what does that tell us about whether it entails the hypothesis "{HYPOTHESIS}"?\n\n{OPTIONS}',
    'Premise:\n"{CONTEXT}" Based on this premise, is the hypothesis "{HYPOTHESIS}" true?\n{OPTIONS}',
    'If {CONTEXT}, can we conclude that "{HYPOTHESIS}"?\n{OPTIONS}',
    '{CONTEXT}\n\nDoes it follow that "{HYPOTHESIS}"?\n{OPTIONS}',
    "{CONTEXT}\nDoes this next sentence follow, given the preceding text?\n{HYPOTHESIS}\n\n{OPTIONS}",
]

In [None]:
NLI_PROMPT_CONTEXT_HYPO[0]

In [None]:
from random import shuffle

def adapt_prompt(prompt_index, sentence, context_hypo_type = False, two_options_nli = False):
  """Fill one prompt template with an example and the list of allowed answers.

  Three template families exist:
    * binary task (two_options_nli set): NLI_TRUE_FALSE_PROMPTS, answers
      ['true', 'false'], `sentence` is a (premise, hypothesis) pair;
    * three labels with context_hypo_type set: NLI_PROMPT_CONTEXT_HYPO,
      `sentence` is a (premise, hypothesis) pair;
    * three labels otherwise: NLI_PROMPTS_SINGLE_SENTENCE, `sentence` is
      inserted as a whole.
  For the three-label families the label order is shuffled on every call.

  Returns the formatted prompt string.
  """
  # Binary task first: fixed option order, no shuffling.
  if two_options_nli != False:
    premise, hypothesis = sentence
    template = NLI_TRUE_FALSE_PROMPTS[prompt_index]
    return template.format(OPTIONS=['true', 'false'], CONTEXT=premise, HYPOTHESIS=hypothesis)

  labels = ['entailment', 'neutral', 'contradiction']
  shuffle(labels)

  if not context_hypo_type:
    template = NLI_PROMPTS_SINGLE_SENTENCE[prompt_index]
    return template.format(OPTIONS=labels, SENTENCE=sentence)

  premise, hypothesis = sentence
  template = NLI_PROMPT_CONTEXT_HYPO[prompt_index]
  # Formatting the list directly renders it as "['a', 'b', 'c']".
  return template.format(OPTIONS=labels, CONTEXT=premise, HYPOTHESIS=hypothesis)

## MODELS

In [None]:
pip install --upgrade pip

In [None]:
!pip install transformers

### FLAN T5

While we qualitatively find that FLAN responds well to most tasks, it does fail on some simple tasks.
For instance, as shown in Figure 22, FLAN fails at the very simple task of returning the second word
in a sentence, and also incorrectly translates a question to Danish when asked to answer the question
in Danish. Additional limitations include a context length of only 1024 tokens (which is not enough
for most summarization tasks), and that the model was mostly trained on English data.

https://openreview.net/pdf?id=gEZrGCozdqR

In [None]:
!pip install sentencepiece

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load tokenizer and weights of the "google/flan-t5-base" checkpoint; both
# objects are used as globals by askFlanNLI.
flan_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
flan_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")



In [None]:
def _flan_output_to_label(result, two_options_nli):
  """Map the text generated by FLAN onto one of the dataset labels."""
  answer = result.strip().lower()

  if two_options_nli:
    # Binary prompts offer ['true', 'false']; same label mapping as
    # askBartNLI / askGPT2NLI: everything that is not a positive answer
    # counts as 'neutral'.
    return 'entailment' if answer in ('true', 'yes', 'entailment') else 'neutral'

  # The three-label prompts list the labels themselves as options, so a
  # literal label is the most common answer and must be kept as it is
  # (previously 'entailment' and 'contradiction' were both turned into 'neutral').
  if answer in ('entailment', 'neutral', 'contradiction'):
    return answer
  if answer == 'yes':
    return 'entailment'
  if answer in ('no', 'conflict'):
    return 'contradiction'
  return 'neutral'


def askFlanNLI(data, solutions, prompt_index = 0, context_hypo_prompt = False, two_options_nli=False):
  """Classify every example with FLAN-T5 and report which answers are correct.

  Args:
    data: iterable of examples as accepted by adapt_prompt.
    solutions: gold labels, in the same order as `data`.
    prompt_index: index of the prompt template inside its family.
    context_hypo_prompt: use the premise/hypothesis template family.
    two_options_nli: use the binary ('true' / 'false') template family.

  Returns:
    A list of booleans, one per example: True when the label derived from the
    generated text equals the gold label.
  """
  expected = list(solutions)

  predictions = []
  for sentence in data:
    task_prefix = adapt_prompt(prompt_index, sentence, context_hypo_prompt, two_options_nli)

    input_ids = flan_tokenizer(task_prefix, return_tensors="pt", padding=True)
    output_sequences = flan_model.generate(input_ids=input_ids["input_ids"],
                                      max_length=1000)
    decoded = flan_tokenizer.batch_decode(output_sequences, skip_special_tokens=True)

    predictions.append(_flan_output_to_label(decoded[0], two_options_nli))

  return [predictions[pos] == expected[pos] for pos in range(len(predictions))]

### BART


In [None]:
from transformers import pipeline
# Zero-shot classification pipeline backed by "facebook/bart-large-mnli";
# used as a global by askBartNLI.
bart_classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

In [None]:
def askBartNLI(data, solutions, prompt_index=None, context_hypo_prompt = False, two_options_nli=False):
  """Classify every example with the BART zero-shot pipeline and report which answers are correct.

  Args:
    data: sequence of examples as accepted by adapt_prompt.
    solutions: gold labels, in the same order as `data`.
    prompt_index: index of the prompt template inside its family; with None
      the raw example is passed to the classifier without a prompt.
    context_hypo_prompt: use the premise/hypothesis template family.
    two_options_nli: classify among ['true', 'false'] and map the result to
      'entailment' / 'neutral' instead of using the three NLI labels.

  Returns:
    A list of booleans, one per example: True when the top-ranked label equals
    the gold label.
  """
  # Materialise both inputs so that positions are used instead of pandas index
  # labels (a sliced Series does not necessarily start at label 0).
  samples = list(data)
  expected = list(solutions)

  if two_options_nli:
    candidate_labels = ['true', 'false']
  else:
    candidate_labels = ['contradiction', 'neutral', 'entailment']

  #Also check the non classifier version and the unprompted version for the classifier
  results = []
  for i, sample in enumerate(samples):  # the original `range (, len(data))` was a syntax error
    if prompt_index is not None:
      prompt = adapt_prompt(prompt_index, sample, context_hypo_prompt, two_options_nli)
    else:
      prompt = sample

    # The pipeline returns the labels sorted by score; the first one is the prediction.
    answer = bart_classifier(prompt, candidate_labels)['labels'][0]

    if two_options_nli:
      answer = 'neutral' if answer == 'false' else 'entailment'

    results.append(answer == expected[i])

  return results

### GPT2


In [None]:
# Check if larger versions exist and can be run.

from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Base "gpt2" checkpoint and its tokenizer; used as globals by gpt2_classify.
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


In [None]:
from string import Template
import numpy as np
import torch

#import streamlit as st

def gpt2_classify(inputs, candidate_labels, prompt="$input is a type of "):
    """Score candidate labels for each input by GPT-2's probability of generating them.

    For every input the `$input` placeholder of `prompt` is replaced by the
    input text, the prompt is run through the global `gpt2_model`, and each
    candidate label is scored with the probability that its tokens follow the
    prompt.

    :param inputs: iterable of texts to classify
    :param candidate_labels: either a list of label strings, or a dict mapping
        a label name to the list of words whose probabilities are summed for
        that label
    :param prompt: template containing `$input`; a trailing space is moved in
        front of every candidate word before tokenization
    :return: list with one dict per input holding "sequence" (the input),
        "labels" (the label names) and "scores" (one value per label, in the
        same order, normalized to sum to 1)
    """

    # A plain list means every label is scored by exactly its own text
    if type(candidate_labels) == list:
        candidate_labels = {k: [k] for k in candidate_labels}

    # Add the final space of the prompt to each candidate label
    if prompt[-1] == " ":
        content_prefix, prompt = " ", prompt[:-1]
    else:
        content_prefix = ""

    # Encode the part of the prompt which is independent of the inputs
    prompt_prefix = prompt.split("$input")[0]
    if len(prompt_prefix) > 0:
        # Keep only the part starting at "$input" as the per-input template
        prompt = prompt[len(prompt_prefix) :]
        # Move a trailing space of the prefix to the start of the template
        if prompt_prefix[-1] == " ":
            prompt_prefix, prompt = prompt_prefix[:-1], " " + prompt
        inputs_prefix = gpt2_tokenizer(prompt_prefix, return_tensors="pt")
        with torch.no_grad():
            past_key_values_prefix = gpt2_model(**inputs_prefix).past_key_values
        del inputs_prefix

    scores = []
    # For each input, encode the input-dependent part of the prompt
    for input in inputs:
        # NOTE: from here on `inputs` is rebound to the token ids of the current
        # input; the loop keeps iterating over the original argument.
        inputs = gpt2_tokenizer.encode(Template(prompt).substitute(input=input))
        inputs = torch.tensor([inputs])
        with torch.no_grad():
            # NOTE(review): the same `past_key_values_prefix` object is passed for
            # every input. If the installed transformers version updates the cache
            # object in place, later inputs would see earlier ones — confirm.
            if len(prompt_prefix) > 0:
                outputs = gpt2_model(inputs, past_key_values=past_key_values_prefix)
            else:
                outputs = gpt2_model(inputs)
        del inputs
        # Distribution over the token that follows the complete prompt
        probs = torch.softmax(outputs.logits[0, -1, :], -1).detach().cpu().numpy()
        past_key_values = outputs.past_key_values
        del outputs

        scores.append({"sequence": input})
        scores[-1]["labels"] = [k for k in candidate_labels]
        scores[-1]["scores"] = []
        # Get the probability of each candidate label being the next tokens
        for k in candidate_labels:
            scores[-1]["scores"].append(0)
            for word in candidate_labels[k]:
                tokens = gpt2_tokenizer.encode(content_prefix + word)
                # Probability of the first token of the word right after the prompt
                prob = probs[tokens[0]]
                if len(tokens) > 1:
                    # Multi-token word: feed all tokens but the last to obtain the
                    # distribution for each following token.
                    # NOTE(review): `past_key_values` is reused for every word, see
                    # the note on cache reuse above — confirm.
                    with torch.no_grad():
                        outputs2 = gpt2_model(
                            torch.tensor([tokens[:-1]]),
                            past_key_values=past_key_values,
                        )
                    probs2 = (
                        torch.softmax(outputs2.logits[0, :, :], -1)
                        .detach()
                        .cpu()
                        .numpy()
                    )
                    del outputs2
                    # Row i - 1 is the distribution predicted after token i - 1
                    for i in range(1, len(tokens)):
                        prob *= probs2[i - 1, tokens[i]]
                # Words belonging to the same label add up
                scores[-1]["scores"][-1] += prob
        # Normalize the scores to get probabilities
        sum_scores = sum(scores[-1]["scores"])
        scores[-1]["scores"] = [x / sum_scores for x in scores[-1]["scores"]]

    return scores


In [None]:
def askGPT2NLI(premise, solutions, prompt_index=None,  context_hypo_prompt = False, two_options_nli=False):
  """Classify every example with gpt2_classify and report which answers are correct.

  `premise` is indexed by position 0 .. len(premise) - 1 and `solutions` must
  provide `.tolist()`. With prompt_index None the raw example is classified,
  otherwise it is first wrapped with adapt_prompt. In binary mode the
  candidates are 'true' / 'false' and 'false' counts as 'neutral', anything
  else as 'entailment'.

  Returns a list of booleans, one per example.
  """
  expected = solutions.tolist()

  if two_options_nli:
    candidate_labels = ['true', 'false']
  else:
    candidate_labels = ['contradiction', 'neutral', 'entailment']

  outcomes = []
  for i in range(len(premise)):
    if prompt_index is None:
      prompt = premise[i]
    else:
      prompt = adapt_prompt(prompt_index, premise[i],  context_hypo_prompt, two_options_nli)

    scored = gpt2_classify([prompt], candidate_labels)[0]
    # Pick the label with the highest score
    best_position = scored["scores"].index(max(scored["scores"]))
    answer = scored["labels"][best_position]

    if two_options_nli:
      answer = 'neutral' if answer == 'false' else 'entailment'

    outcomes.append(True if answer == expected[i] else False)

  return outcomes

### GPT3

In [None]:
pip install openai

In [None]:
import os
from getpass import getpass

import openai as ai

# Never store the API key in the notebook: the key that used to be hard-coded
# here has been exposed and should be revoked. The key is read from the
# OPENAI_API_KEY environment variable, with an interactive prompt as fallback.
ai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [None]:
def generate_gpt3_response(user_text, print_output=False):
    """
    Send one prompt to the OpenAI completion endpoint and return the generated text
    :type user_text: str the prompt to complete
    :type print_output: boolean whether or not to print the raw response object
    """
    request = dict(
        engine='text-davinci-003',  # Model used for the completion
        temperature=0.5,            # Sampling temperature
        prompt=user_text,           # Text to complete
        max_tokens=100,             # Upper bound on the number of generated tokens
        n=1,                        # Ask for a single completion
        stop=None,                  # No custom stop sequence
    )
    completions = ai.Completion.create(**request)

    # The raw response helps when something goes wrong
    if print_output:
        print(completions)

    # Only one completion was requested; hand back its text
    first_choice = completions.choices[0]
    return first_choice.text

In [None]:
def askGPT3NLI(query, answers, prompt_index=0, context_hypo_prompt = False, two_options_nli=False):
  """Classify every example with GPT-3 and report which answers are correct.

  Args:
    query: iterable of examples as accepted by adapt_prompt.
    answers: gold labels, in the same order as `query`.
    prompt_index: index of the prompt template inside its family.
    context_hypo_prompt: use the premise/hypothesis template family.
    two_options_nli: use the binary ('true' / 'false') template family; the
      answer is then mapped to 'entailment' / 'neutral' like in the other
      ask*NLI functions.

  Returns:
    A list of booleans, one per example: True when the normalized completion
    equals the gold label.
  """
  expected = list(answers)

  predictions = []
  for elem in query:
    # Build the prompt from the current example (the whole `query` collection
    # was passed here before).
    prompt = adapt_prompt(prompt_index, elem, context_hypo_prompt, two_options_nli)
    response = generate_gpt3_response(prompt)
    print(response)

    # Completions usually come with leading newlines and arbitrary casing.
    answer = response.strip().lower()
    if two_options_nli:
      if answer == 'true':
        answer = 'entailment'
      elif answer == 'false':
        answer = 'neutral'
    # TODO: free-text completions longer than a single label are still counted as wrong.
    predictions.append(answer)

  return [predictions[pos] == expected[pos] for pos in range(len(predictions))]

##  EXPERIMENTS

### EXECUTION PIPELINE

In [None]:
def askNLI(model, questions, answers, prompt_index, context_hypo_prompt = False, two_options_nli=False):
  """Dispatch an evaluation run to the ask*NLI function of the requested model.

  `model` is one of 'BART', 'FLAN', 'GPT2' or 'GPT3'; any other value falls
  back to BART. All remaining arguments are forwarded unchanged and the list
  returned by the selected function is returned.
  """
  # The function is looked up only for the selected model, so models whose
  # cells were not executed do not get in the way.
  if model == 'FLAN':
    ask = askFlanNLI
  elif model == 'GPT2':
    ask = askGPT2NLI
  elif model == 'GPT3':
    ask = askGPT3NLI
  else:
    # 'BART' and every unknown model name
    ask = askBartNLI

  return ask(questions, answers, prompt_index, context_hypo_prompt, two_options_nli)


In [None]:
NLI_DATASETS[0]

In [None]:
str = "";
def test_on_datasets(model, test_mode = False):
  binary_task = False;
  for dataset_index in range (1, len(NLI_DATASETS)):
    if dataset_index == 0:
      binary_task = True;
      single_sentence_propmts_num = 0;
      prompt_num = len(NLI_TRUE_FALSE_PROMPTS);
    else:
      binary_task = False;
      single_sentence_propmts_num = len(NLI_PROMPTS_SINGLE_SENTENCE)
      prompt_num = single_sentence_propmts_num+len(NLI_PROMPT_CONTEXT_HYPO);

    prompt_to_use = range(0, prompt_num) if test_mode == False else best_prompts[model][dataset_index]
    for prompt_index in prompt_to_use:
      question, answer = import_dataset(dataset_index, test_mode)

      if dataset_index == 2 and test_mode:
        question = question[0:1000]
        answer = answer[0:1000]

      context_hypo_prompt = prompt_index >= single_sentence_propmts_num

      prompt_index = prompt_index - single_sentence_propmts_num if context_hypo_prompt else prompt_index;

      results = askNLI(model, question, answer, prompt_index, context_hypo_prompt, binary_task)

      dataset_name = NLI_DATASETS[dataset_index]

      corrects = sum(bool(x) for x in results)

      str = f"Model: {model}\tDataset name: {dataset_name}\t Prompt with context: {context_hypo_prompt}\t Prompt index: {prompt_index}\t - Correct Answers {corrects}/{len(results)} - {corrects/len(results)}"
      print(str)
      corrects = 0
    print("\n\n")

In [None]:
question, answer = import_dataset(2, True)

### PROMPT SELECTION

In [None]:
test_on_datasets('FLAN')

In [None]:
test_on_datasets('BART')

In [None]:
test_on_datasets('GPT2')

### EVALUATION

In [None]:
# Placeholder so that `best_prompts` exists before the table is filled in
# (the name was misspelled as `bset_prompts`, which nothing reads).
best_prompts = {}

In [None]:
# Prompt indices selected for every model; the inner lists follow the order of
# NLI_DATASETS. In test mode test_on_datasets evaluates every index listed for
# a dataset, time_tests only uses the first one.
# Index semantics (see test_on_datasets / time_tests): for dataset 0 the index
# selects from NLI_TRUE_FALSE_PROMPTS; for the other datasets indices below
# len(NLI_PROMPTS_SINGLE_SENTENCE) select a single-sentence prompt and larger
# ones select NLI_PROMPT_CONTEXT_HYPO after subtracting that length.
# NOTE(review): no call visible in this notebook passes the model name
# "FLAN-SMALL" — confirm whether that entry is still needed.
best_prompts = {
    "FLAN-SMALL":[[ #best_prompts for "scitail"
        1, 6, 8
      ],[ #best_prompts for "anli"
        6, 11, 15
      ],[ #best_prompts for "glue-mnli"
        9, 10, 11
      ],[ #best_prompts for  "sick"
        5, 6, 11
      ],[ #best_prompts for "superglue-cb"
        15, 16, 21
      ]],
      "FLAN":[[ #best_prompts for "scitail"
        0, 1, 6
      ],[ #best_prompts for "anli"
        13, 15, 21
      ],[ #best_prompts for "glue-mnli"
        9, 11, 16
      ],[ #best_prompts for  "sick"
        14, 16, 20
      ],[ #best_prompts for "superglue-cb"
        12, #17, 18   (currently disabled)
      ]],
    "BART": [[ #best_prompts for "scitail"
        1, 5, 8
      ],[ #best_prompts for "anli"
        2, 11, 21
      ],[ #best_prompts for "glue-mnli"
        2#, 12, 19   (currently disabled)
      ],[ #best_prompts for  "sick"
        1, 11, 18
      ],[ #best_prompts for "superglue-cb"
        1, 3, 20
      ]],
    "GPT2": [[ #best_prompts for "scitail"
        1, 5, 8
      ],[ #best_prompts for "anli"
        9, 18, 21
      ],[ #best_prompts for "glue-mnli"
        4, 15, 19
      ],[ #best_prompts for  "sick"
        4, 14, 19
      ],[ #best_prompts for "superglue-cb"
        1, 3, 4
      ]],
    # No prompts have been selected for GPT3 yet: every list is empty.
    "GPT3": [[ #best_prompts for "scitail"

      ],[ #best_prompts for "anli"

      ],[ #best_prompts for "glue-mnli"

      ],[ #best_prompts for  "sick"

      ],[ #best_prompts for "superglue-cb"

    ]]
}

### BINARY NLI

In [None]:
test_on_datasets('FLAN', True)

In [None]:
test_on_datasets('GPT2', True)

In [None]:
test_on_datasets('BART', True)

In [None]:
test_on_datasets('BART', True)

### NLI (3 OPTIONS)

#### ANLI

In [None]:
test_on_datasets('FLAN', True)

In [None]:
test_on_datasets('BART', True)

In [None]:
test_on_datasets('GPT2', True)

#### GLUE-MNLI

In [None]:
test_on_datasets('FLAN', True)

In [None]:
test_on_datasets('GPT2', True)

#### SICK

In [None]:
test_on_datasets('BART', True)

#### SUPERGLUE-CB

### TIME

In [None]:
import time

In [None]:
def time_tests(model, test_mode = True, sample_size = 32):
  """Measure how long a model needs to answer a small sample of every dataset.

  For each dataset the first prompt recorded in
  best_prompts[model][dataset_index] is used. Datasets without a recorded
  prompt are skipped with a message instead of raising an IndexError.

  Args:
    model: model name understood by askNLI.
    test_mode: True loads the test split and keeps only the first
      `sample_size` examples; False loads the complete dev split.
    sample_size: number of examples timed in test mode (default 32, the value
      that was previously hard-coded).

  Prints one line per dataset; returns None.
  """
  for dataset_index in range(len(NLI_DATASETS)):
    dataset_name = NLI_DATASETS[dataset_index]

    # Dataset 0 is the binary task and has no single-sentence prompts.
    binary_task = dataset_index == 0
    single_sentence_prompts_num = 0 if binary_task else len(NLI_PROMPTS_SINGLE_SENTENCE)

    recorded_prompts = best_prompts[model][dataset_index]
    if len(recorded_prompts) == 0:
      print(f"Model: {model}\tDataset name: {dataset_name}\t - skipped: no best prompt recorded")
      continue
    prompt_index = recorded_prompts[0]

    question, answer = import_dataset(dataset_index, test_mode)

    if test_mode:
      question = question[0:sample_size]
      answer = answer[0:sample_size]

    # Global prompt indices list the single-sentence prompts first, followed
    # by the premise/hypothesis prompts.
    context_hypo_prompt = prompt_index >= single_sentence_prompts_num
    if context_hypo_prompt:
      prompt_index = prompt_index - single_sentence_prompts_num

    # perf_counter is monotonic and meant for measuring durations.
    start = time.perf_counter()
    askNLI(model, question, answer, prompt_index, context_hypo_prompt, binary_task)
    elapsed = time.perf_counter() - start

    print(f"Model: {model}\tDataset name: {dataset_name}\t - Time: {elapsed}")

In [None]:
time_tests("FLAN", True)

In [None]:
time_tests("GPT2", True)

In [None]:
time_tests("BART", True)