In [None]:
!pip install transformers
!huggingface-cli login --token

In [None]:
!pip install accelerate
!pip install xformers

## SETUP AND DATASET LOADING

In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

In [None]:
import pandas as pd

In [None]:
path_prefix = "/content/gdrive/MyDrive/DATASETS/NLI/"

In [None]:
cd /content/gdrive/MyDrive/DATASETS/NLI/

In [None]:
ls

In [None]:
# Dataset folder names. The order is significant: other cells address a dataset
# by its position in this list (index 0 is handled as the binary task, index 2 is
# truncated in test mode, and `best_prompts` has one row per entry in this order).
NLI_DATASETS = ["scitail", "anli", "glue-mnli", "sick", "superglue-cb"]

superglue cb
sick
scitail
anli
glue-mnli


In [None]:
NLI_DATASETS

In [None]:
import pandas as pd
import re

In [None]:
# Presumably the seeds of the available few-shot splits (TODO confirm).
# Not referenced by any other code visible in this notebook.
DATASET_SEEDS = [13, 21, 42, 87, 100]
# Field markers that preprocess_nli deletes from every raw example before
# splitting it on "[SEP]". Matching is done with str.replace, so it is case-sensitive.
WORDS_TO_REMOVE = ["context:", "premise:", "sentence 1:", "sentence 2:", "sentence:", "hypothesis:"]

In [None]:
def import_dataset(dataset_index, test_set = False, seed = 100):
  """Load one split of an NLI dataset and split each example into two parts.

  The file ``<name>/<name>_16_<seed>_<split>.tsv`` is read relative to the
  current working directory as a header-less, tab-separated file. Column 0
  holds the raw example text and column 1 the label.

  Args:
    dataset_index: position of the dataset in NLI_DATASETS.
    test_set: truthy loads the ``test`` split, otherwise the ``dev`` split.
    seed: seed encoded in the file name. Defaults to 100, the value that was
      previously hard-coded in the body.

  Returns:
    A tuple ``(examples, labels)`` of pandas Series. Every element of
    ``examples`` is the tuple produced by preprocess_nli.
  """
  dataset_name = NLI_DATASETS[dataset_index]
  print(dataset_name)

  split = "test" if test_set else "dev"
  split_path = f"{dataset_name}/{dataset_name}_16_{seed}_{split}.tsv"
  df = pd.read_csv(split_path, sep="\t", header=None)

  examples = df[0].apply(preprocess_nli)
  return (examples, df[1])

In [None]:
def preprocess_nli(query, words_to_remove = None):
  """Strip field markers from a raw example and split it on "[SEP]".

  Args:
    query: raw example text containing at least one literal "[SEP]".
    words_to_remove: substrings deleted from ``query`` before splitting.
      Defaults to the module-level WORDS_TO_REMOVE.

  Returns:
    A tuple with the text before the first "[SEP]" and the text between the
    first and second "[SEP]" (or the end of the string). Surrounding
    whitespace is kept as-is.

  Raises:
    IndexError: if ``query`` contains no "[SEP]".
  """
  if words_to_remove is None:
    words_to_remove = WORDS_TO_REMOVE

  for marker in words_to_remove:
    query = query.replace(marker, "")

  # Plain str.split on the literal separator: no regex needed, and it avoids
  # the invalid "\[" escape that a non-raw pattern string would contain.
  parts = query.split("[SEP]")
  return (parts[0], parts[1])

In [None]:
# Load the dev split of every dataset once; the returned data is discarded,
# only the dataset name printed by import_dataset is shown.
for dataset_inex in range(len(NLI_DATASETS)):
  import_dataset(dataset_inex)

## PROMPTS


The scitail dataset is set up for binary NLI: the possible answers are entailment and not entailment.

In [None]:
# Prompt templates for the two-option task. Each template is filled with
# str.format using the placeholders {CONTEXT}, {HYPOTHESIS} and {OPTIONS}.
# The wording is kept verbatim because results depend on the exact prompt text;
# the only change is in the third template, where a stray "\ n" (backslash,
# space, "n") stood in for the intended second newline.
NLI_TRUE_FALSE_PROMPTS = [
  "Premise: {CONTEXT}\n\nHypothesis: {HYPOTHESIS}\n\nDoes the premise entails the hypothesis?\n\n{OPTIONS}",
  "Premise: {CONTEXT}\nHypothesis: {HYPOTHESIS}\nIs the hypothesis entailed by the premise?\n{OPTIONS}",
  "Here is a premise:\n{CONTEXT}\n\nHere is a hypothesis:\n{HYPOTHESIS}\n\nIs it possible to conclude that if the premise is true, then so is the hypothesis?\n{OPTIONS}",
  "Sentence 1: {CONTEXT}\n\nSentence 2: {HYPOTHESIS}\nIs this second sentence entailed by the first sentence?\n\n{OPTIONS}",
  "Sentence 1: {CONTEXT}\n\nSentence 2: {HYPOTHESIS}\n\nIf the first sentence is true, then is the second sentence true?\n{OPTIONS}",
  'Based on the premise \"{CONTEXT}\", can we conclude the hypothesis \"{HYPOTHESIS}\" is true?\n\n{OPTIONS}',
  'Premise: \"{CONTEXT}\" If this premise is true, what does that tell us about whether it entails the hypothesis \"{HYPOTHESIS}\"?\n\n{OPTIONS}',
  'Premise:\n\"{CONTEXT}\" Based on this premise, is the hypothesis \"{HYPOTHESIS}\" true?\n {OPTIONS}',
  'If {CONTEXT}, can we conclude that \"{HYPOTHESIS}\"?\n{OPTIONS}',
  '{CONTEXT}\n\nDoes it follow that \"{HYPOTHESIS}\"?\n{OPTIONS}'
]

In [None]:
# Prompt templates that present the whole example through a single {SENTENCE}
# placeholder. adapt_prompt fills them with str.format, passing OPTIONS and
# SENTENCE; the template at index 1 spells the three labels out itself and has
# no {OPTIONS} placeholder (str.format ignores the unused keyword).
NLI_PROMPTS_SINGLE_SENTENCE = [
    "Choose the correct label among: {OPTIONS} for the following Natural Language Inference task: {SENTENCE}",
    "Read the follwing sentence. sentence: {SENTENCE}. Is it an 'entailment', a 'contradiction' or is it 'neutral'?",
    "Assign one of the following labels: {OPTIONS} to the following sentence S: {SENTENCE}" ,
    "The following assertion: '{SENTENCE}' is {OPTIONS}?",
    "A sentence can be of one type among: {OPTIONS}. Which type of sentence is [{SENTENCE}]?",
]

In [None]:
# Prompt templates with separate {CONTEXT} and {HYPOTHESIS} placeholders plus
# {OPTIONS}; filled with str.format by adapt_prompt.
#
# Two fixes compared with the previous version of this list:
#  * a comma was missing after the "Can we draw the following conclusion?"
#    template, so Python silently concatenated it with the next one (18 entries
#    instead of 19, one of them containing every placeholder twice);
#  * the "Based on the premise" template lacked the opening quote before
#    {HYPOTHESIS}.
# NOTE(review): restoring the missing entry shifts every later position by one,
# so index-based selections such as `best_prompts` must be re-validated.
NLI_PROMPT_CONTEXT_HYPO = [
    "{CONTEXT}\nBased on the paragraph above can we conclude that the hypothesis: \"{HYPOTHESIS}\" is more likely to be a kind of {OPTIONS}? Choose only one of answer among theese three.",
    "{CONTEXT}\n\nBased on that paragraph can we conclude that this sentence is true?\n{HYPOTHESIS}\n\n{OPTIONS}",
    "{CONTEXT}\n\nCan we draw the following conclusion?\n{HYPOTHESIS}\n\n{OPTIONS}",
    "{CONTEXT}\nDoes this next sentence follow, given the preceding text?\n{HYPOTHESIS}\n\n{OPTIONS}",
    "{CONTEXT}\nCan we infer the following?\n{HYPOTHESIS}\n\n{OPTIONS}",
    "Read the following paragraph and determine if the hypothesis is true:\n\n{CONTEXT}\n\nHypothesis: {HYPOTHESIS}\n\n{OPTIONS}",
    "Read the text and determine if the sentence is true:\n\n{CONTEXT}\n\nSentence: {HYPOTHESIS}\n\n{OPTIONS}",
    "Can we draw the following hypothesis from the CONTEXT? \n\nCONTEXT:\n\n{CONTEXT}\n\nHypothesis: {HYPOTHESIS}\n\n{OPTIONS}",
    "Determine if the sentence is true based on the text below:\n{HYPOTHESIS}\n\n{CONTEXT}\n{OPTIONS}",
    "Premise: {CONTEXT}\n\nHypothesis: {HYPOTHESIS}\n\nDoes the premise entail the hypothesis?\n\n{OPTIONS}",
    "Premise: {CONTEXT}\nHypothesis: {HYPOTHESIS}\nIs the hypothesis entailed by the premise?\n{OPTIONS}",
    "Here is a premise:\n{CONTEXT}\n\nHere is a hypothesis:\n{HYPOTHESIS}\n\nIs it possible to conclude that if the premise is true, then so is the hypothesis?\n{OPTIONS}",
    "Sentence 1: {CONTEXT}\n\nSentence 2: {HYPOTHESIS}\nIs this second sentence entailed by the first sentence?\n\n{OPTIONS}",
    "Sentence 1: {CONTEXT}\n\nSentence 2: {HYPOTHESIS}\n\nIf the first sentence is true, then is the second sentence true?\n{OPTIONS}",
    'Based on the premise "{CONTEXT}", can we conclude the hypothesis "{HYPOTHESIS}" is true?\n\n{OPTIONS}',
    'Premise: "{CONTEXT}" If this premise is true, what does that tell us about whether it entails the hypothesis "{HYPOTHESIS}"?\n\n{OPTIONS}',
    'Premise:\n"{CONTEXT}" Based on this premise, is the hypothesis "{HYPOTHESIS}" true?\n{OPTIONS}',
    'If {CONTEXT}, can we conclude that "{HYPOTHESIS}"?\n{OPTIONS}',
    '{CONTEXT}\n\nDoes it follow that "{HYPOTHESIS}"?\n{OPTIONS}'
]

In [None]:
NLI_PROMPT_CONTEXT_HYPO[0]

In [None]:
from random import shuffle

def adapt_prompt(prompt_index, sentence, context_hypo_type = False, two_options_nli = False):
  """Fill one prompt template for a single example.

  Args:
    prompt_index: position of the template inside the selected template list.
    sentence: the example. It is unpacked into (context, hypothesis) for the
      two-option and context/hypothesis templates, and inserted as-is into the
      single-sentence templates.
    context_hypo_type: for the three-label task, choose NLI_PROMPT_CONTEXT_HYPO
      (True) or NLI_PROMPTS_SINGLE_SENTENCE (False). Ignored for two options.
    two_options_nli: anything other than False selects NLI_TRUE_FALSE_PROMPTS.

  Returns:
    The formatted prompt string. For the three-label task the label list is
    shuffled with random.shuffle before it is rendered.
  """
  # Explicit comparison kept on purpose: only a value equal to False selects
  # the three-label branch.
  if two_options_nli != False:
    premise, hypothesis = sentence
    template = NLI_TRUE_FALSE_PROMPTS[prompt_index]
    return template.format(OPTIONS=['true', 'false'], CONTEXT=premise, HYPOTHESIS=hypothesis)

  labels = ['entailment', 'neutral', 'contradiction']
  shuffle(labels)

  if not context_hypo_type:
    return NLI_PROMPTS_SINGLE_SENTENCE[prompt_index].format(OPTIONS=labels, SENTENCE=sentence)

  premise, hypothesis = sentence
  # Same text as the list's own repr: "['a', 'b', 'c']".
  rendered_labels = "[" + ", ".join(f"'{label}'" for label in labels) + "]"
  return NLI_PROMPT_CONTEXT_HYPO[prompt_index].format(OPTIONS=rendered_labels, CONTEXT=premise, HYPOTHESIS=hypothesis)

## MODEL

In [None]:
from transformers import AutoTokenizer
import transformers
import torch

In [None]:

# Hugging Face checkpoint id; it is also printed in the result lines of the
# evaluation and timing cells.
model = "meta-llama/Llama-2-7b-chat-hf"

# NOTE(review): `tokenizer` is not used by any active code visible in this
# notebook (it only appears in a commented-out argument further down).
tokenizer = AutoTokenizer.from_pretrained(model)

# Pipeline instance called by askLlamaNLI. The module-level name `pipeline`
# refers to this instance; the factory stays reachable as transformers.pipeline.
# NOTE(review): the task is "zero-shot-classification" — confirm that this
# checkpoint provides the kind of classification head that task expects.
pipeline = transformers.pipeline(
    "zero-shot-classification",
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
)


In [None]:
def askLlamaNLI(data, solutions, prompt_index = 0, context_hypo_prompt = False, two_options_nli=False):
  """Classify every example with the module-level pipeline and compare to the gold label.

  Args:
    data: iterable of examples, passed one by one to adapt_prompt.
    solutions: iterable of gold labels, aligned with ``data`` by position.
    prompt_index: template index forwarded to adapt_prompt.
    context_hypo_prompt: forwarded to adapt_prompt as ``context_hypo_type``.
    two_options_nli: selects the two-label candidate set and is forwarded to
      adapt_prompt.

  Returns:
    A list of bool, one per example: True when the first entry of the
    pipeline's ``labels`` output equals the gold label.
  """
  # The candidate set does not depend on the example, so build it once.
  if two_options_nli:
    candidate_labels = ["neutral", "entailment"]
  else:
    candidate_labels = ["contradiction", "neutral", "entailment"]

  results = []
  # Iterate by position (zip) rather than by label (data[i]) so that Series
  # with a non-default index and plain lists both work.
  for example, gold_label in zip(data, solutions):
    prompt = adapt_prompt(prompt_index, example, context_hypo_prompt, two_options_nli)

    prediction = pipeline(
      prompt,
      candidate_labels = candidate_labels,
      max_length=500
    )

    results.append(bool(prediction["labels"][0] == gold_label))

  return results

In [None]:
# Prompt indices per dataset (presumably the best-scoring ones); row i belongs
# to NLI_DATASETS[i]. test_on_datasets uses a whole row in test mode, and
# time_tests uses the first value of each row.
# For dataset 0 the values index NLI_TRUE_FALSE_PROMPTS directly. For the other
# datasets each value is a combined index: values below
# len(NLI_PROMPTS_SINGLE_SENTENCE) address that list, larger values address
# NLI_PROMPT_CONTEXT_HYPO after subtracting that length.
# NOTE(review): the values depend on the order and length of the prompt lists;
# re-check them whenever a prompt list changes.
best_prompts = [
    [0,7,8],
    [0,2,12],
    [0,3,6],
    [12,19,21],
    [5,6,9]
]

In [None]:
import warnings

# Suppress every Python warning for the rest of the session.
# Deliberately no module-level variable named `str` here: binding it would
# shadow the built-in type for the whole notebook, and nothing reads it.
warnings.filterwarnings("ignore")


def test_on_datasets(test_mode = False):
  binary_task = False;
  for dataset_index in range(3,4): ## len(NLI_DATASETS)):
    if dataset_index == 0:
      binary_task = True;
      single_sentence_propmts_num = 0;
      prompt_num = len(NLI_TRUE_FALSE_PROMPTS);
    else:
      binary_task = False;
      single_sentence_propmts_num = len(NLI_PROMPTS_SINGLE_SENTENCE)
      prompt_num = single_sentence_propmts_num+len(NLI_PROMPT_CONTEXT_HYPO);

    prompt_to_use = range(0, prompt_num) if test_mode == False else best_prompts[dataset_index]
    for prompt_index in prompt_to_use:
      question, answer = import_dataset(dataset_index, test_mode)

      if dataset_index == 2 and test_mode:
        question = question[0:1000]
        answer = answer[0:1000]

      context_hypo_prompt = prompt_index >= single_sentence_propmts_num

      prompt_index = prompt_index - single_sentence_propmts_num if context_hypo_prompt else prompt_index;

      results = askLlamaNLI(question, answer, prompt_index, context_hypo_prompt, binary_task)

      dataset_name = NLI_DATASETS[dataset_index]

      corrects = sum(bool(x) for x in results)

      str = f"Model: {model}\tDataset name: {dataset_name}\t Prompt with context: {context_hypo_prompt}\t Prompt index: {prompt_index}\t - Correct Answers {corrects}/{len(results)} - {corrects/len(results)}"
      print(str)

    print("\n")

## scitail

In [None]:
test_on_datasets()

In [None]:
test_on_datasets(True)

## anli

In [None]:
test_on_datasets()

In [None]:
test_on_datasets(True)

## glue-mnli

In [None]:
test_on_datasets()

In [None]:
test_on_datasets(True)

## sick

In [None]:
test_on_datasets()

In [None]:
test_on_datasets(True)

## superglue-cb

In [None]:
test_on_datasets()

In [None]:
test_on_datasets(True)

## Example of model behavior with a free-form prompt

In [None]:
# Ad-hoc probe: call the module-level pipeline on a free-form prompt with
# sampling-style arguments and keep the raw output in `sequence`.
# NOTE(review): `pipeline` is built earlier in this notebook with the task
# "zero-shot-classification", and no `candidate_labels` is passed here —
# confirm that this call still runs with that task.
sequence = pipeline(
    ' Chose one random letter between ["A","B","C"]',
      do_sample=True,
      top_k=10,
      num_return_sequences=1,
      #eos_token_id=tokenizer.eos_token_id,
      max_length=400
    )

In [None]:
sequence

## time evaluation

In [None]:
import time

In [None]:
def time_tests(test_mode = True):
  """Time one askLlamaNLI run per dataset and print the elapsed seconds.

  For every dataset in NLI_DATASETS the first prompt of its ``best_prompts``
  row is used. Only the askLlamaNLI call is timed; loading the data is not.

  Args:
    test_mode: forwarded to import_dataset to choose the split. When truthy,
      only the first 32 examples are used.
  """
  for dataset_index in range(len(NLI_DATASETS)):
    # Dataset 0 is the two-option task; it has no single-sentence prompts,
    # so its combined index maps straight onto its template list.
    binary_task = dataset_index == 0
    single_sentence_prompts_num = 0 if binary_task else len(NLI_PROMPTS_SINGLE_SENTENCE)

    combined_index = best_prompts[dataset_index][0]
    question, answer = import_dataset(dataset_index, test_mode)

    if test_mode:
      question = question[0:32]
      answer = answer[0:32]

    context_hypo_prompt = combined_index >= single_sentence_prompts_num
    if context_hypo_prompt:
      prompt_index = combined_index - single_sentence_prompts_num
    else:
      prompt_index = combined_index

    # perf_counter is monotonic, so the difference is unaffected by clock
    # adjustments during the run.
    start = time.perf_counter()
    askLlamaNLI(question, answer, prompt_index, context_hypo_prompt, binary_task)
    elapsed = time.perf_counter() - start

    dataset_name = NLI_DATASETS[dataset_index]
    print(f"Model: {model}\tDataset name: {dataset_name}\t - Time: {elapsed}")

In [None]:
time_tests()