In [1]:
import logging
logging.basicConfig(level='ERROR')

In [2]:
import pandas as pd

In [3]:
!gdown 1Kr5nfwQ1hdg4VkKPJNmljnc4kdqTyo83 -O SHROOM_test-unlabeled.zip

Downloading...
From: https://drive.google.com/uc?id=1Kr5nfwQ1hdg4VkKPJNmljnc4kdqTyo83
To: /home/slpl/SHROOM_test-unlabeled.zip
100%|█████████████████████████████████████████| 312k/312k [00:00<00:00, 587kB/s]


In [4]:
!unzip -o /home/slpl/SHROOM_test-unlabeled.zip

Archive:  /home/slpl/SHROOM_test-unlabeled.zip
  inflating: SHROOM_test-unlabeled/test.model-agnostic.json  
  inflating: SHROOM_test-unlabeled/test.model-aware.json  


01 : Running With LangChain

In [5]:
import torch
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [6]:
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [8]:
model_4bit = AutoModelForCausalLM.from_pretrained( model_id, device_map="auto",quantization_config=quantization_config, )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [10]:
pipeline = pipeline(
        "text-generation",
        model=model_4bit,
        tokenizer=tokenizer,
        use_cache=True,
        device_map="auto",
        max_length=500,
        do_sample=True,
        top_k=5,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
)

In [11]:
from langchain import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
llm = HuggingFacePipeline(pipeline=pipeline)

In [12]:
template_mt = """<s>[INST] You are a helpful, respectful and honest assistant.
Answer the following question using this json format: {{answer: (yes, no or maybe), score: (an integer number between 1 and 5, which 1 is for not supported and 5 is for full supported), description: (a description for your answer)}}.
{question} Is the Sentence supported by the Context above? [/INST] </s>
"""
template_pg = """<s>[INST] You are a helpful, respectful and honest assistant.
Answer the following question using this json format: {{answer: (yes, no or maybe), score: (an integer number between 1 and 5, which 1 is for not supported and 5 is for full supported), description: (a description for your answer)}}.
{question} Is the Sentence supported by the Context above? [/INST] </s>
"""
template_dm = """<s>[INST] You are a helpful, respectful and honest assistant.
Answer the following question using this json format: {{answer: (yes, no or maybe), score: (an integer number between 1 and 5, which 1 is for not supported and 5 is for full supported), description: (a description for your answer)}}.
{question} Is the Sentence supported by the Context above? [/INST] </s>
"""

In [13]:
def chat_with_llm(question , template):
    prompt = PromptTemplate(template=template, input_variables=["question"])
    llm_chain = LLMChain(prompt=prompt, llm=llm)
    response = llm_chain.run({"question":question})
    return response

In [14]:
run_on_test = True # whether this baseline system is ran on the test splits or the val splits

In [15]:
#from google.colab import drive
#drive.mount('/content/drive')

In [16]:
import json
import regex

Running on the model-aware track data

In [None]:
# change paths appropriately
# make sure the output filename is the same as the reference filename for the scoring program
path_val_model_aware = "/home/slpl/SHROOM_test-unlabeled/test.model-aware.json"
path_val_model_aware_output = "/home/slpl/results/test.model-aware.json"

#from datasets import load_dataset
import json
import random
import numpy as np
import tqdm.notebook as tqdm
seed_val = 442
random.seed(seed_val)
np.random.seed(seed_val)

# simple JSON loading
with open(path_val_model_aware, 'r') as istr:
    data_val_all = json.load(istr)
num_sample = len(data_val_all)
print(num_sample)
#num_sample = 5
output_json = []
labels = ["Not Hallucination", "Hallucination"]
"""
SelfCheckGPT Usage: (LLM) Prompt
https://github.com/potsawee/selfcheckgpt
Context: {}
Sentence: {}
Is the sentence supported by the context above?
Answer Yes or No:
"""
pattern = regex.compile(r'\{(?:[^{}]|(?R))*\}')
for i in tqdm.trange(num_sample):
    task = str(data_val_all[i]['task'])
    if run_on_test:
        # test splits will contain ids to ensure correct alignment before scoring
        id = int(data_val_all[i]['id'])
    hyp = str(data_val_all[i]['hyp'])
    src = str(data_val_all[i]['src'])
    tgt = str(data_val_all[i]['tgt'])

    if task == "PG":
        context = f"{src}"
    else: #i.e. task == "MT" or task == "DM":
        context = f"{tgt}"

    message = f"Context: {context}\nSentence: {hyp}\n"
    item_to_json=[]
    idx = 0
    for j in range(10):
      try:
        response = chat_with_llm(message, template_mt)
        answer = str(response.strip().lower()).replace("```", "")
        match = pattern.findall(answer)[0]
        print(match)
        p = json.loads(match)
        item_to_json.append(p)
        idx = idx + 1
        if (idx == 3):
          break;
      except:
        print("An exception occurred")

    if run_on_test:
        item_to_json.append({"id":id})

    output_json.append(item_to_json)

    f = open(path_val_model_aware_output, 'w', encoding='utf-8')
    json.dump(output_json, f)
    f.close()
print("done")

1500


  0%|          | 0/1500 [00:00<?, ?it/s]

  warn_deprecated(


{
  "answer": "no",
  "score": 1,
  "description": "the sentence 'a crazy person' is not supported by the context as it is a general and not specific enough. there are many different contexts that could potentially support the sentence 'a [insert specific context] is a crackpot or lunatic', but without further information, it cannot be determined if this sentence is supported."
}
{
    "answer": "maybe",
    "score": 3,
    "description": "the context provided is ambiguous and could potentially be applied to the word 'crazy' in different ways. for example, it could be seen as implying that the person is not sane or rational, but it could also be seen as implying that the person has unconventional or creative ideas. without further context, it is difficult to definitively determine the level of support for the statement that a crazy person is a crackpot or lunatic."
}
{
answer: yes,
score: 5,
description: the context of "crazy person" implies a person with erratic or irrational behavior



An exception occurred
An exception occurred




An exception occurred
An exception occurred




An exception occurred




{
  "answer": "yes",
  "score": 5,
  "description": "the context mentions the east african islands, which are located in the eastern part of the indian ocean and are part of the eastern islands of the indian ocean."
}




An exception occurred




An exception occurred
An exception occurred




{
"answer": "maybe",
"score": 4,
"description": "the sentence describes the location of the eastern islands in relation to the eastern part of the indian ocean, but does not specifically mention the islands being in the east african islands. the context provides additional information about the general location of the islands in the indian ocean, which is partially supported by the sentence."
}




{
answer: maybe,
score: 3,
description: "the sentence is partially supported by the context. the eastern islands of the indian ocean are indeed located in the eastern part of the indian ocean, but the location of the east african islands within the indian ocean is not explicitly mentioned in the context."
}
An exception occurred




An exception occurred


