# 計算 model 中 被認為是 answer 的 token 的出現機率

### GPU

In [1]:
!nvidia-smi

Sun Aug 27 13:57:40 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.86.10              Driver Version: 535.86.10    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0 Off |                  Off |
| 44%   49C    P8              24W / 450W |      6MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        On  | 00000000:04:00.0 Off |  

### Weights & Biases (Assisting Metrics, Optional)

In [2]:
!pip install wandb
!wandb login
project_name = "Evaluate Ambiguous Options Probability"
import os

os.environ["WANDB_PROJECT"] = project_name

[34m[1mwandb[0m: Currently logged in as: [33mreily[0m ([33mblurr[0m). Use [1m`wandb login --relogin`[0m to force relogin


### Imports & device selection (use the GPU when available)

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
import json
from pprint import pprint
# Report how many CUDA devices PyTorch can see.
num_gpus = torch.cuda.device_count()
print(f'Detect {num_gpus} GPUS')

# Run on CUDA when it is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device("cpu")

Detect 2 GPUS


### Loading the dataset

In [4]:
def read_chatGPT_data(path='../data/chatGPT_answer/cloth-f-fit-answer-no-ans_3.json'):
    """Load the ChatGPT-annotated dataset from a JSON file.

    Args:
        path: Location of the JSON file. Defaults to the file this notebook
            was written against, so calling with no argument still works.

    Returns:
        Whatever object ``json.load`` decodes from the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Be explicit about the encoding: without it, open() falls back to the
    # platform locale, which breaks on non-ASCII text on some systems.
    with open(path, encoding='utf-8') as f:
        return json.load(f)
# Load the dataset once and pull out its three splits.
chatGPT_answer_data = read_chatGPT_data()
train = chatGPT_answer_data['train']
valid = chatGPT_answer_data['valid']
# The file's 'eval' entry is what the rest of the notebook refers to as `test`.
test = chatGPT_answer_data['eval']

## 處理 load 進來的資料

### Prepare data

In [7]:
def make_model_input(data):
    """Split a list of dataset records into parallel lists.

    Each record must provide the keys 'sentence', 'distractors', 'answer'
    and 'ranked_distractors'.

    Args:
        data: Iterable of record dicts.

    Returns:
        A 5-tuple of lists, all aligned with the order of ``data``:
        sentences, distractor lists, answers, the 'ranked_distractors'
        entries from index 3 onward, and the model input strings built as
        ``sentence + '</s>' + answer``.
    """
    sentences, list_distractors, answers = [], [], []
    alike_answer_list, model_input_sentences = [], []

    for record in data:
        stem = record['sentence']
        options = record['distractors']
        gold = record['answer']
        # Everything after the first three ranked distractors.
        remaining_ranked = record['ranked_distractors'][3:]
        # The separator token joins the cloze sentence and its answer.
        joined = stem + '</s>' + gold

        sentences.append(stem)
        list_distractors.append(options)
        answers.append(gold)
        model_input_sentences.append(joined)
        alike_answer_list.append(remaining_ranked)

    return sentences, list_distractors, answers, alike_answer_list, model_input_sentences

# Unpack each split into parallel lists; the last element of every tuple
# (`*_sent`) holds the 'sentence</s>answer' strings built by make_model_input.
train_sentences, train_distractors, train_answers, train_alike_answer, train_sent = make_model_input(train)
valid_sentences, valid_distractors, valid_answers, valid_alike_answer, valid_sent = make_model_input(valid)
test_sentences, test_distractors, test_answers, test_alike_answer, test_sent = make_model_input(test)

# 先定義算分方式

In [9]:
import re
def keep_only_letters(text):
    """Return ``text`` with every character other than a-z / A-Z removed."""
    return re.sub(r"[^a-zA-Z]", "", text)

In [10]:
"""
評估實驗結果的指標
以克漏句子為單位計算
"""

import json
import argparse
from math import log

def cala_repeat(pred_distractors, d_isRepeat):
    """Tally how many duplicated entries each prediction list contains.

    For every list in ``pred_distractors`` the number of duplicates is
    ``len(list) - len(set(list))``. The matching counter in ``d_isRepeat``
    is incremented in place; counts of three or more share bucket 3.

    Args:
        pred_distractors: Sequence of lists of hashable predictions.
        d_isRepeat: Mutable mapping/list indexed by 0..3, updated in place.
    """
    for preds in pred_distractors:
        duplicates = len(preds) - len(set(preds))
        # Everything at or above three duplicates is pooled into bucket 3.
        bucket = 3 if duplicates >= 3 else duplicates
        d_isRepeat[bucket] += 1

def cala_answer(answers_text, pred_distractors, d_isAnswer):
    """Tally how often the correct answer shows up among the predictions.

    For each question that has a prediction list, count the predictions
    equal to that question's answer and increment the counter stored under
    that count in ``d_isAnswer`` (in place). Questions beyond the end of
    ``pred_distractors`` are ignored.

    Args:
        answers_text: Sequence of answers, one per question.
        pred_distractors: Sequence of prediction lists, aligned with answers.
        d_isAnswer: Mutable mapping/list indexed by the match count.
    """
    # zip() stops at the shorter input, which skips answers without predictions.
    for answer, preds in zip(answers_text, pred_distractors):
        matches = 0
        for candidate in preds:
            if candidate == answer:
                matches += 1
        d_isAnswer[matches] += 1

def eval_idcg(actual, predicted, k):
    """Ideal DCG@k for a prediction list with binary relevance.

    The relevance of each prediction is 1 when it appears in ``actual`` and
    0 otherwise; the ideal ordering places all relevant predictions first.

    Args:
        actual: Container of relevant items.
        predicted: List of predicted items.
        k: Cut-off rank. Values larger than ``len(predicted)`` are allowed;
            the missing ranks contribute nothing.

    Returns:
        The ideal DCG as a float (0.0 when nothing is relevant or k <= 0).
    """
    gains = sorted((1 if pred in actual else 0 for pred in predicted), reverse=True)

    idcg = 0.
    # Slicing (instead of indexing up to k) avoids an IndexError when fewer
    # than k predictions are available.
    for position, rel in enumerate(gains[:max(k, 0)], start=1):
        idcg += rel / log(position + 1, 2)
    return idcg

def eval_dcg(actual, predicted, k):
    """DCG@k of a prediction list with binary relevance.

    A prediction at 1-based rank ``i`` contributes ``1 / log2(i + 1)`` when
    it appears in ``actual`` and nothing otherwise.

    Args:
        actual: Container of relevant items.
        predicted: List of predicted items, best first.
        k: Cut-off rank. Values larger than ``len(predicted)`` are allowed;
            the missing ranks contribute nothing.

    Returns:
        The DCG as a float.
    """
    dcg = 0.
    # Slicing (instead of indexing up to k) avoids an IndexError when fewer
    # than k predictions are available.
    for position, pred in enumerate(predicted[:max(k, 0)], start=1):
        if pred in actual:
            dcg += 1 / log(position + 1, 2)
    return dcg


def eval_map(actual, predicted):
    """Mean average precision over a set of questions.

    For each question, average precision is the sum of precision@i over
    every rank ``i`` holding a relevant prediction, divided by the number
    of relevant items. The result is the mean over all questions.

    The previous implementation accumulated ``i / rank`` (the reciprocal of
    precision) and never normalised by the number of relevant items, so it
    could return values well above 1.

    Args:
        actual: Sequence of containers of relevant items, one per question.
        predicted: Sequence of prediction lists (best first), aligned with
            ``actual``. Questions without a prediction list score 0.

    Returns:
        MAP as a float in [0, 1] when predictions contain no duplicates;
        0.0 when ``actual`` is empty.
    """
    n = len(actual)
    if n == 0:
        return 0.

    _map = 0.
    for qid in range(n):
        relevant = actual[qid]
        if len(relevant) == 0:
            # Nothing to retrieve: this question contributes 0.
            continue
        candidates = predicted[qid] if qid < len(predicted) else []

        hits = 0
        precision_sum = 0.
        for position, candidate in enumerate(candidates, start=1):
            if candidate in relevant:
                hits += 1
                # precision@position = relevant items seen so far / position
                precision_sum += hits / position
        _map += precision_sum / len(relevant)
    return _map / n

def eval_mrr(actual, predicted):
    """Reciprocal rank of the first relevant prediction.

    Args:
        actual: Container of relevant items.
        predicted: Iterable of predictions, best first.

    Returns:
        ``1 / rank`` of the first prediction found in ``actual`` (1-based),
        or 0.0 when none of the predictions is relevant.
    """
    for position, candidate in enumerate(predicted, start=1):
        if candidate in actual:
            return 1.0 / position
    return 0.

def eval_recall(actual, predicted, k):
    """Recall@k: share of distinct relevant items found in the top-k predictions.

    Args:
        actual: Iterable of relevant (hashable) items; must not be empty.
        predicted: List of predictions, best first.
        k: Number of leading predictions to consider.

    Returns:
        ``|set(actual) & set(predicted[:k])| / |set(actual)|`` as a float.
    """
    relevant = set(actual)
    found = relevant.intersection(predicted[:k])
    return len(found) / float(len(relevant))

def eval_precision(actual, predicted, k):
    """Precision@k: distinct relevant items in the top-k predictions, over k.

    Args:
        actual: Iterable of relevant (hashable) items.
        predicted: List of predictions, best first.
        k: Number of leading predictions to consider; also the denominator.

    Returns:
        ``|set(actual) & set(predicted[:k])| / k`` as a float.
    """
    relevant = set(actual)
    found = relevant.intersection(predicted[:k])
    return len(found) / float(k)


def eval_ndcg(actual, predicted, k):
    """NDCG@k: ``eval_dcg`` divided by ``eval_idcg`` for the same inputs.

    Args:
        actual: Container of relevant items.
        predicted: List of predictions, best first.
        k: Cut-off rank passed through to both helpers.

    Returns:
        ``dcg / idcg`` as a float, or 0.0 when the ideal DCG is zero.
    """
    achieved = eval_dcg(actual, predicted, k)
    ideal = eval_idcg(actual, predicted, k)
    # A zero ideal DCG means no prediction is relevant; avoid dividing by it.
    if ideal == 0:
        return 0.
    return achieved / ideal

def process(actual, predicted):
    """Pad ``predicted`` in place so it lines up with ``actual``.

    Questions without any prediction list receive a list of empty strings
    as long as their entry in ``actual``. Existing prediction lists that are
    shorter than their ``actual`` counterpart are padded with empty strings;
    longer ones are left untouched.

    Args:
        actual: Sequence of sized items, one per question.
        predicted: List of prediction lists; mutated in place.
    """
    for qid, gold in enumerate(actual):
        if qid < len(predicted):
            # Multiplying by a non-positive count yields an empty list, so
            # equal-length and over-long predictions are left as they are.
            shortfall = len(gold) - len(predicted[qid])
            predicted[qid].extend([""] * shortfall)
        else:
            predicted.append([""] * len(gold))


In [11]:
class ClothDataset(torch.utils.data.Dataset):
    """Dataset view over a tokenizer encoding object.

    ``encodings`` must expose ``items()`` yielding (name, tensor) pairs and
    an ``input_ids`` attribute whose length is the number of examples.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Hand out detached copies so callers cannot modify the stored tensors.
        example = {}
        for name, tensor in self.encodings.items():
            example[name] = tensor[idx].detach().clone()
        return example
    
class ClothDatasetcpu(torch.utils.data.Dataset):
    """Dataset view over a tokenizer encoding object that returns CPU tensors.

    ``encodings`` must expose ``items()`` yielding (name, tensor) pairs and
    an ``input_ids`` attribute whose length is the number of examples.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Each returned tensor is a detached copy moved to the CPU.
        example = {}
        for name, tensor in self.encodings.items():
            example[name] = tensor[idx].detach().cpu().clone()
        return example

In [12]:
import copy
# Evaluate on a deep copy of the test split.
test_sample = copy.deepcopy(test)
# NOTE(review): `juan100` is not defined in any cell of this notebook, so an
# unguarded deepcopy raises NameError on Restart & Run All. Copy it only
# when an earlier cell has actually provided it; otherwise leave it as None.
juan100_sample = copy.deepcopy(juan100) if 'juan100' in globals() else None
# Number of decoded candidates that compute_score groups per test sentence.
distractor_num = 10
# Batch size of the evaluation DataLoader.
batch_size = 2

In [13]:
def compute_score(tokenizer,model,test_dataLoader,test_sample,distractor_num):
    """Generate candidates for every test sentence and average ranking metrics.

    Each batch is passed to ``model.generate`` with beam search, asking for
    ``distractor_num`` sequences per input. The decoded strings are reduced
    to ASCII letters with ``keep_only_letters``, grouped ``distractor_num``
    at a time, and compared against the matching entry of ``test_sample``.

    The number of returned sequences used to be hard-coded to 10 while the
    decoded output was sliced by ``distractor_num``; any other value silently
    misaligned predictions and samples. Both now follow ``distractor_num``.

    Args:
        tokenizer: Object providing ``batch_decode``.
        model: Object providing ``eval()`` and ``generate()``.
        test_dataLoader: Iterable of batches indexable by 'input_ids' and
            'attention_mask'. Batches must follow the order of ``test_sample``.
        test_sample: Sequence of dicts with keys 'answer', 'distractors' and
            'ranked_distractors'.
        distractor_num: Number of sequences generated (and scored) per input.
            NOTE(review): the @10 metrics look at the first 10 candidates, so
            values below 10 are presumably not intended - confirm.

    Returns:
        Dict with the per-question averages 'P@1', 'P@3', 'R@1', 'R@3',
        'R@10', 'NDCG@3', 'NDCG@10', 'MRR', 'F1@3', 'A@3', 'alike@3', plus
        'n_question' (``len(test_sample)``).

    Raises:
        AssertionError: If the number of scored sentences differs from
            ``len(test_sample)``.
    """
    n_question = len(test_sample)
    metric_names = ('P@1', 'P@3', 'R@1', 'R@3', 'R@10', 'NDCG@3', 'NDCG@10',
                    'MRR', 'F1@3', 'A@3', 'alike@3')
    totals = dict.fromkeys(metric_names, 0.)
    sentence_count = 0

    model.eval()
    with torch.no_grad():
        for batch in test_dataLoader:
            output = model.generate(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                # Beam search cannot return more sequences than it has beams.
                num_beams=max(12, distractor_num),
                num_return_sequences=distractor_num,
            )
            batch_output = tokenizer.batch_decode(output, skip_special_tokens=True)

            # Consecutive groups of `distractor_num` strings belong to one input.
            slice_num = len(batch_output) // distractor_num
            for start in range(0, slice_num * distractor_num, distractor_num):
                group = batch_output[start:start + distractor_num]
                pred_dist = [keep_only_letters(text) for text in group]

                sample = test_sample[sentence_count]
                answer = sample['answer']
                distractor_list = sample['distractors']
                # Ranked distractors from index 3 onward.
                alike_set = set(sample['ranked_distractors'][3:])
                top3 = set(pred_dist[:3])

                p_3 = eval_precision(distractor_list, pred_dist, 3)
                r_3 = eval_recall(distractor_list, pred_dist, 3)
                if p_3 == 0 and r_3 == 0:
                    f1_3 = 0
                else:
                    f1_3 = 2 * (p_3 * r_3 / (p_3 + r_3))

                totals['P@1'] += eval_precision(distractor_list, pred_dist, 1)
                totals['P@3'] += p_3
                totals['R@1'] += eval_recall(distractor_list, pred_dist, 1)
                totals['R@3'] += r_3
                totals['R@10'] += eval_recall(distractor_list, pred_dist, 10)
                totals['NDCG@3'] += eval_ndcg(distractor_list, pred_dist, 3)
                totals['NDCG@10'] += eval_ndcg(distractor_list, pred_dist, 10)
                totals['MRR'] += eval_mrr(distractor_list, pred_dist)
                totals['F1@3'] += f1_3
                # 1 when the correct answer itself is among the top three.
                totals['A@3'] += 1.0 if answer in top3 else 0.0
                totals['alike@3'] += len(alike_set & top3) / float(3)

                sentence_count += 1

    assert sentence_count == n_question, "sentence length not match"

    result = {key: total / n_question for key, total in totals.items()}
    result['n_question'] = n_question
    return result

In [None]:
from os import listdir
name = "your_DACLDG_project_name"
write_file = "stastic_chatgpt_{}.jsonl".format(name)
mypath = "../model/t5_DACL-DG/{}".format(name)
# listdir() returns entries in arbitrary order; sort them so the checkpoints
# are evaluated (and written out) in a reproducible sequence.
mydir = sorted(listdir(mypath))
# To evaluate a single checkpoint instead: mydir = ["checkpoint-2659"]

# The context manager closes the results file even if a checkpoint fails to
# load or evaluate, so records already written are not lost.
with open(write_file, "a", encoding="UTF-8") as f:
    for i in mydir:
        myfolder = "{}/{}".format(mypath,i)
        print(myfolder)
        # Each checkpoint is scored with the tokenizer saved next to it.
        tokenizer = T5Tokenizer.from_pretrained(myfolder)

        test_encodings = tokenizer(test_sent, truncation=True, padding=True, return_tensors="pt").to(device)

        test_dataset = ClothDataset(test_encodings)

        # CPU-side view of the same encodings (not used by the scoring below).
        test_cpu_dataset = ClothDatasetcpu(test_encodings)

        model = T5ForConditionalGeneration.from_pretrained(myfolder)
        model.to(device)

        # shuffle=False keeps batches aligned with the order of test_sample.
        test_dataLoader = torch.utils.data.DataLoader(test_dataset, batch_size = batch_size, shuffle = False)
        score = compute_score(tokenizer,model,test_dataLoader,test_sample,distractor_num)
        score["name"] = i

        # One JSON object per line (JSON Lines), one line per checkpoint.
        json_data = json.dumps(score, ensure_ascii=False) + "\n"
        f.write(json_data)