#### Set Up SelfCheckGPT

In [None]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub.
# NOTE(review): called with no arguments, so it presumably asks for an access
# token interactively — confirm against the huggingface_hub documentation.
login()

In [None]:
import torch
import modeling_selfcheck
import warnings
# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the NLI-based checker on the selected device.
selfcheck_nli = modeling_selfcheck.SelfCheckNLI(device=device)

#### Load WikiBio Dataset

In [None]:
import json

# Parse the annotated dataset directly from the file handle.  The encoding is
# pinned to UTF-8 so the result does not depend on the platform's default
# locale encoding.
with open("data/dataset_v3.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

print(f"The length of the dataset: {len(dataset)}")

In [None]:
import numpy as np

# Numeric severity assigned to each sentence-level annotation string.
label_mapping = {
    "accurate": 0.0,
    "minor_inaccurate": 0.5,
    "major_inaccurate": 1.0,
}

# Binary per-sentence labels for each passage, keyed by `wiki_bio_test_idx`:
#   human_label_detect_False   -> 1 where severity > 0.499 (minor or major inaccurate)
#   human_label_detect_True    -> 1 where severity < 0.499 (accurate)
#   human_label_detect_False_h -> 1 where severity > 0.99  (major inaccurate only);
#                                 filled only for passages whose mean severity
#                                 is below 0.99
human_label_detect_False = {}
human_label_detect_True = {}
human_label_detect_False_h = {}

for record in dataset:
    passage_id = record["wiki_bio_test_idx"]
    severity = np.array([label_mapping[tag] for tag in record["annotation"]])

    is_inaccurate = severity > 0.499
    is_accurate = severity < 0.499
    human_label_detect_False[passage_id] = is_inaccurate.astype(np.int32).tolist()
    human_label_detect_True[passage_id] = is_accurate.astype(np.int32).tolist()

    # Passages whose mean severity reaches 0.99 (in effect, every sentence is
    # major-inaccurate) are left out of the "harder" table.
    if np.mean(severity) < 0.99:
        is_major = severity > 0.99
        human_label_detect_False_h[passage_id] = is_major.astype(np.int32).tolist()

print("Length of False:", len(human_label_detect_False))
print("Length of True:", len(human_label_detect_True))
print("Length of False_h:", len(human_label_detect_False_h))

#### SelfCheck NLI

In [None]:
from tqdm import tqdm

def _scores_to_jsonable(scores):
    """Return *scores* as a plain Python list that ``json.dump`` can write.

    Objects exposing ``tolist()`` (e.g. NumPy arrays) are converted through
    it, which turns their scalars into built-in Python numbers; any other
    iterable is copied into a list unchanged.
    """
    tolist = getattr(scores, "tolist", None)
    return tolist() if callable(tolist) else list(scores)


# Raw `predict` output per passage, keyed by `wiki_bio_test_idx`.
scores_nli = {}

for x in tqdm(dataset):
    idx = x["wiki_bio_test_idx"]

    # Hand the passage's sentences and its sampled passages to the checker.
    scores_nli[idx] = selfcheck_nli.predict(
        sentences=x["gpt3_sentences"],
        sampled_passages=x["gpt3_text_samples"],
    )

# JSON-serialisable copy of the scores.  The return type of `predict` is not
# visible here, so the conversion does not assume its scalars are already
# built-in floats.
scores_nli_json = {
    idx: _scores_to_jsonable(scores) for idx, scores in scores_nli.items()
}

with open("data/scores_nli.json", "w", encoding="utf-8") as outfile:
    json.dump(scores_nli_json, outfile)