# 計算 一次生成三個 distractor 的模型，在 Ambiguous Options 的表現，資料使用 ChatGPT 生成的 Ambiguous Options

### GPU

In [74]:
!nvidia-smi

Sun Jul 23 21:34:46 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090         On | 00000000:01:00.0 Off |                  Off |
| 71%   62C    P2              318W / 450W|  20689MiB / 24564MiB |     87%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090         On | 00000000:04:00.0 Off |  

### Weights & Biases (Assisting Metrics, Optional)

In [75]:
!pip install wandb
!wandb login
project_name = "Evaluate Ambiguous Options Probability"
import os

os.environ["WANDB_PROJECT"] = project_name

Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Using cached urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
Installing collected packages: urllib3
  Attempting uninstall: urllib3
    Found existing installation: urllib3 2.0.4
    Uninstalling urllib3-2.0.4:
      Successfully uninstalled urllib3-2.0.4
[31mERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

sentry-sdk 1.22.2 requires urllib3>=1.26.11; python_version >= "3.6", but you'll have urllib3 1.25.11 which is incompatible.[0m
Successfully installed urllib3-1.25.11
[34m[1mwandb[0m: Currently logged in as: [33mreily[0m ([33mblurr[0m). Use [1m`wandb login --relogin`[0m to force relogin


### Imports & device selection (use GPU when available)

In [76]:
from transformers import T5Tokenizer
import torch
import json
from pprint import pprint
# Report how many CUDA devices PyTorch can see.
num_gpus = torch.cuda.device_count()
print(f'Detect {num_gpus} GPUS')
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")

Detect 2 GPUS


### Loading the dataset

In [77]:
def read_chatGPT_data(path='../data/chatGPT_answer/cloth-f-fit-answer-no-ans_3.json'):
    """Load the ChatGPT-annotated dataset from a JSON file.

    Args:
        path: Location of the JSON file. Defaults to the file this notebook
            was originally hard-wired to, so existing no-argument calls keep
            working.

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.
    """
    # Be explicit about UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding='utf-8') as f:
        return json.load(f)
# Load the dataset once and pull out the three splits.
chatGPT_answer_data = read_chatGPT_data()
train = chatGPT_answer_data['train']
valid = chatGPT_answer_data['valid']
# The split used as the test set in this notebook is stored under the 'eval' key.
test = chatGPT_answer_data['eval']

## 處理 load 進來的資料

### Prepare data

In [80]:
test[0]

{'sentence': 'I met Kurt Kampmeir of Success Motivation Incorporation for breakfast. While we were _ ,Kurt askedme, " John, what is your plan for personal growth? Never at a loss for words, I tried to find things in my life that might qualify for growth.I toldhim about the many activities in which I was involved . ',
 'distractors': ['preparing', 'thinking', 'working'],
 'answer': 'eating',
 'index': 0,
 'sorted_distractors': ['thinking', 'preparing', 'working', 'eating'],
 'ranked_distractors': ['thinking',
  'preparing',
  'working',
  'chatting',
  'conversing',
  'discussing']}

In [82]:
# 一個單字生成一次，所以需
def make_model_input(data):
    """Split dataset records into parallel lists and build the model inputs.

    Every record must provide the keys 'sentence', 'distractors' and 'answer'.
    The model input for a record is its sentence, the literal '</s>'
    separator, and its answer, concatenated in that order.

    Args:
        data: Iterable of record dicts.

    Returns:
        A 4-tuple of equally long lists:
        (sentences, list_distractors, answers, model_input_sentences).
    """
    sentences, list_distractors, answers, model_input_sentences = [], [], [], []

    for record in data:
        text = record['sentence']
        options = record['distractors']
        gold = record['answer']

        sentences.append(text)
        list_distractors.append(options)
        answers.append(gold)
        # Encoder input format: "<sentence></s><answer>".
        model_input_sentences.append(text + '</s>' + gold)

    return sentences, list_distractors, answers, model_input_sentences

# Build the parallel lists (sentences, distractors, answers, model inputs) for each split.
train_sentences, train_distractors, train_answers, train_sent = make_model_input(train)
valid_sentences, valid_distractors, valid_answers, valid_sent = make_model_input(valid)
# NOTE(review): the original comment on the next line said that test_sent comes in
# groups of five sentences, so the batch size must be 5 or generations from
# different groups get mixed. make_model_input emits exactly one input per record,
# so that remark looks stale -- confirm.
test_sentences, test_distractors, test_answers, test_sent = make_model_input(test)

In [84]:
len(test_sent), len(test_answers), len(test_distractors)

(9371, 9371, 9371)

In [85]:
pprint(test_sent[0])

('I met Kurt Kampmeir of Success Motivation Incorporation for breakfast. While '
 'we were _ ,Kurt askedme, " John, what is your plan for personal growth? '
 'Never at a loss for words, I tried to find things in my life that might '
 'qualify for growth.I toldhim about the many activities in which I was '
 'involved . </s>eating')


In [86]:
pprint(test_distractors[0])

['preparing', 'thinking', 'working']


In [87]:
pprint(test_answers[0])

'eating'


In [88]:
# Load the tokenizer from the fine-tuned checkpoint directory (the same path the model is loaded from).
tokenizer = T5Tokenizer.from_pretrained("../t5-training/model/t5-base-clean-sent-ans-tripleD-,split")

In [89]:
# Tokenize each split in a single call (with truncation and padding enabled) and
# move the resulting tensors to `device`.
# NOTE(review): this places the whole tokenized split on `device` at once -- confirm
# this fits in memory when `device` is a GPU.
train_encodings = tokenizer(train_sent, truncation=True, padding=True, return_tensors="pt").to(device)
valid_encodings = tokenizer(valid_sent, truncation=True, padding=True, return_tensors="pt").to(device)
test_encodings = tokenizer(test_sent, truncation=True, padding=True, return_tensors="pt").to(device)

In [90]:
test_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [91]:
print(test_encodings.input_ids[0])

tensor([   27,  1736,  8333,    17,  8329,  2028,    15,    23,    52,    13,
        16581, 25279,    86, 14723,   127,   257,    21,  3688,     5,   818,
           62,   130,     3,   834,     3,     6,   439,   450,    17,  1380,
          526,     6,    96,  1079,     6,   125,    19,    39,   515,    21,
          525,  1170,    58,  8400,    44,     3,     9,  1453,    21,  1234,
            6,    27,  1971,    12,   253,   378,    16,    82,   280,    24,
          429,  9448,    21,  1170,     5,   196,  1219, 10813,    81,     8,
          186,  1087,    16,    84,    27,    47,  1381,     3,     5,     1,
         3182,     1,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [92]:
len(test_encodings.input_ids[0])

500

In [93]:
tokenizer.decode(test_encodings.input_ids[0])

'I met Kurt Kampmeir of Success Motivation Incorporation for breakfast. While we were _,Kurt askedme, " John, what is your plan for personal growth? Never at a loss for words, I tried to find things in my life that might qualify for growth.I toldhim about the many activities in which I was involved.</s> eating</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

In [94]:
len(test_encodings.input_ids)

9371

In [95]:
len(test_encodings.input_ids)

9371

In [96]:
test_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [97]:
# print(test_encodings.labels[0])

In [98]:
class ClothDataset(torch.utils.data.Dataset):
    """Dataset view over a tokenizer encoding.

    `encodings` must offer `.items()` yielding (name, indexable tensor) pairs
    and expose an `input_ids` attribute whose length is the number of examples.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One example per row of input_ids.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Hand back independent copies so callers cannot modify the stored encodings.
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx].detach().clone()
        return sample

# Wrap each split's encodings so they can be fed to a DataLoader.
train_dataset = ClothDataset(train_encodings)
valid_dataset = ClothDataset(valid_encodings)
test_dataset = ClothDataset(test_encodings)

In [99]:
class ClothDatasetcpu(torch.utils.data.Dataset):
    """Dataset view over a tokenizer encoding that returns CPU tensors.

    Same contract as a plain encoding-backed dataset, except every tensor of
    a returned example is moved to the CPU before being copied.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One example per row of input_ids.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Detach, move to CPU, then clone so the result is independent of the stored encodings.
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx].detach().cpu().clone()
        return sample
# CPU-returning view of the test encodings.
# NOTE(review): not referenced by the evaluation cell visible in this notebook -- confirm it is still needed.
test_cpu_dataset = ClothDatasetcpu(test_encodings)

In [100]:
type(test_dataset)

__main__.ClothDataset

In [101]:
test_dataset[0]

{'input_ids': tensor([   27,  1736,  8333,    17,  8329,  2028,    15,    23,    52,    13,
         16581, 25279,    86, 14723,   127,   257,    21,  3688,     5,   818,
            62,   130,     3,   834,     3,     6,   439,   450,    17,  1380,
           526,     6,    96,  1079,     6,   125,    19,    39,   515,    21,
           525,  1170,    58,  8400,    44,     3,     9,  1453,    21,  1234,
             6,    27,  1971,    12,   253,   378,    16,    82,   280,    24,
           429,  9448,    21,  1170,     5,   196,  1219, 10813,    81,     8,
           186,  1087,    16,    84,    27,    47,  1381,     3,     5,     1,
          3182,     1,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

# Inference 

In [102]:
from transformers import T5ForConditionalGeneration
# Load the fine-tuned checkpoint (same directory as the tokenizer) and move it to `device`.
model = T5ForConditionalGeneration.from_pretrained("../t5-training/model/t5-base-clean-sent-ans-tripleD-,split")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [103]:
import copy
# Independent deep copy of the test records, kept separate from `test` for use during evaluation.
test_sample = copy.deepcopy(test)

In [104]:
# Evaluation settings.
# NOTE(review): distractor_num and no_repeat_ngram are not passed to model.generate()
# in the evaluation cell of this notebook -- confirm whether they are still needed.
distractor_num = 6
no_repeat_ngram = 3
# The evaluation loop looks up one test_sample record per batch, so it relies on a batch size of 1.
batch_size = 1

In [105]:
# shuffle=False keeps batches in dataset order, which the evaluation loop relies on
# when it pairs batch number N with test_sample[N].
test_dataLoader = torch.utils.data.DataLoader(test_dataset, batch_size = batch_size, shuffle = False)

In [107]:
test_sample[0]

{'sentence': 'I met Kurt Kampmeir of Success Motivation Incorporation for breakfast. While we were _ ,Kurt askedme, " John, what is your plan for personal growth? Never at a loss for words, I tried to find things in my life that might qualify for growth.I toldhim about the many activities in which I was involved . ',
 'distractors': ['preparing', 'thinking', 'working'],
 'answer': 'eating',
 'index': 0,
 'sorted_distractors': ['thinking', 'preparing', 'working', 'eating'],
 'ranked_distractors': ['thinking',
  'preparing',
  'working',
  'chatting',
  'conversing',
  'discussing']}

In [108]:
import re
def keep_only_letters(text):
    """Return `text` with every character other than ASCII letters (a-z, A-Z) removed."""
    return re.sub(r"[^a-zA-Z]", "", text)

In [109]:
"""
評估實驗結果的指標
以克漏句子為單位計算
"""

import json
import argparse
from math import log

def cala_repeat(pred_distractors, d_isRepeat):
    """Tally, per question, how many predicted distractors are duplicates.

    For each prediction list the number of duplicates is its length minus its
    number of distinct entries. `d_isRepeat[count]` is incremented in place,
    with every count of three or more pooled into `d_isRepeat[3]`.

    Args:
        pred_distractors: Sequence of prediction lists (hashable entries).
        d_isRepeat: Mutable mapping/list indexed by 0..3; updated in place.
    """
    for predictions in pred_distractors:
        duplicates = len(predictions) - len(set(predictions))
        # Counts of 3 and above share the last bucket.
        bucket = min(duplicates, 3)
        d_isRepeat[bucket] += 1

def cala_answer(answers_text, pred_distractors, d_isAnswer):
    """Tally, per question, how many predicted distractors equal the answer.

    Questions are paired with prediction lists by position; questions without
    a corresponding prediction list are skipped. `d_isAnswer[count]` is
    incremented in place for each paired question.

    Args:
        answers_text: Sequence of answers, one per question.
        pred_distractors: Sequence of prediction lists.
        d_isAnswer: Mutable mapping/list indexed by match count; updated in place.
    """
    # zip stops at the shorter sequence, so unpaired questions are ignored.
    for answer, predictions in zip(answers_text, pred_distractors):
        matches = sum(1 for dis in predictions if dis == answer)
        d_isAnswer[matches] += 1

def eval_idcg(actual, predicted, k):
    """Ideal DCG@k of `predicted` under binary relevance.

    Relevance of a prediction is 1 if it occurs in `actual`, else 0. The ideal
    ordering is the best possible re-ranking of the predictions themselves.

    Args:
        actual: Collection of relevant items.
        predicted: Ranked list of predictions.
        k: Cut-off rank. May exceed len(predicted); the missing positions
            contribute nothing (previously this raised IndexError).

    Returns:
        The ideal DCG as a float (0.0 when no prediction is relevant or k <= 0).
    """
    # Relevance flags sorted best-first give the ideal ordering.
    ideal = sorted((1 if pred in actual else 0 for pred in predicted), reverse=True)

    idcg = 0.
    # Slicing bounds the loop by len(predicted); max() keeps a non-positive k
    # from being interpreted as a from-the-end slice.
    for i, rel in enumerate(ideal[:max(k, 0)], start=1):
        idcg += rel / log(i+1, 2)
    return idcg

def eval_dcg(actual, predicted, k):
    """DCG@k of `predicted` under binary relevance.

    A prediction at rank i (1-based) contributes 1 / log2(i + 1) when it
    occurs in `actual`, and nothing otherwise.

    Args:
        actual: Collection of relevant items.
        predicted: Ranked list of predictions.
        k: Cut-off rank. May exceed len(predicted); the missing positions
            contribute nothing (previously this raised IndexError).

    Returns:
        The DCG as a float.
    """
    dcg = 0.
    # Slicing bounds the loop by len(predicted); max() keeps a non-positive k
    # from being interpreted as a from-the-end slice.
    for i, pred in enumerate(predicted[:max(k, 0)], start=1):
        if pred in actual:
            dcg += 1 / log(i+1, 2)
    return dcg


def eval_map(actual, predicted):
    """Mean average precision over all questions.

    For each question, average precision is the sum of precision@i over every
    rank i that holds a relevant prediction, divided by the number of relevant
    items for that question. The result is the mean over all questions.

    The previous implementation added ``i / rank`` (the reciprocal of the
    precision at each hit) and never normalised per question, so scores could
    exceed 1.

    Args:
        actual: List with one collection of relevant items per question.
        predicted: List with one ranked prediction list per question. A
            question without a prediction list is scored as 0.
            Prediction lists are expected to be free of duplicates.

    Returns:
        MAP as a float in [0, 1] (0.0 when there are no questions).
    """
    n = len(actual)
    if n == 0:
        return 0.

    _map = 0.
    for qid in range(n):
        relevant = actual[qid]
        candidates = predicted[qid] if qid < len(predicted) else []

        hits = 0
        precision_sum = 0.
        for position, candidate in enumerate(candidates, start=1):
            if candidate in relevant:
                hits += 1
                # Precision at this rank: relevant items seen so far / rank.
                precision_sum += hits / position

        # A question with no relevant items contributes 0 rather than dividing by zero.
        if relevant:
            _map += precision_sum / len(relevant)
    return _map / n

def eval_mrr(actual, predicted):
    """Reciprocal rank of the first relevant prediction.

    Args:
        actual: Collection of relevant items.
        predicted: Ranked list of predictions.

    Returns:
        1 / rank (1-based) of the first prediction found in `actual`,
        or 0.0 when none of the predictions is relevant.
    """
    for position, candidate in enumerate(predicted, start=1):
        if candidate in actual:
            return 1.0 / position
    return 0.

def eval_recall(actual, predicted, k):
    """Recall@k: share of the distinct relevant items found in the top-k predictions.

    Args:
        actual: Collection of relevant items (hashable).
        predicted: Ranked list of predictions (hashable).
        k: Cut-off rank.

    Returns:
        Recall as a float. Returns 0.0 when `actual` is empty (previously this
        raised ZeroDivisionError).
    """
    act_set = set(actual)
    if not act_set:
        # Nothing to recall; avoid dividing by zero.
        return 0.
    pred_set = set(predicted[:k])
    return len(act_set & pred_set) / float(len(act_set))

def eval_precision(actual, predicted, k):
    """Precision@k: distinct relevant items among the top-k predictions, divided by k.

    The denominator is always `k`, even when fewer than `k` predictions exist.

    Args:
        actual: Collection of relevant items (hashable).
        predicted: Ranked list of predictions (hashable).
        k: Cut-off rank.

    Returns:
        Precision as a float.
    """
    relevant_in_top_k = set(actual) & set(predicted[:k])
    return len(relevant_in_top_k) / float(k)


def eval_ndcg(actual, predicted, k):
    """NDCG@k: DCG@k divided by the ideal DCG@k.

    Args:
        actual: Collection of relevant items.
        predicted: Ranked list of predictions.
        k: Cut-off rank.

    Returns:
        dcg / idcg as a float, or 0.0 when the ideal DCG is zero.
    """
    gain = eval_dcg(actual, predicted, k)
    ideal_gain = eval_idcg(actual, predicted, k)
    if ideal_gain == 0:
        # No relevant prediction at all: define NDCG as 0 instead of dividing by zero.
        return 0.
    return gain / ideal_gain

def process(actual, predicted):
    """Pad `predicted` in place with empty strings so it lines up with `actual`.

    - A question with no prediction list gets a new list of "" entries, one
      per item in that question's `actual` entry.
    - A prediction list shorter than its `actual` entry is extended with ""
      up to the same length. Longer prediction lists are left untouched.

    Args:
        actual: List with one list of reference items per question.
        predicted: List of prediction lists; modified in place.
    """
    for qid, reference in enumerate(actual):
        if qid >= len(predicted):
            predicted.append([""] * len(reference))
            continue
        # A non-positive shortfall multiplies to an empty list, i.e. no padding.
        shortfall = len(reference) - len(predicted[qid])
        predicted[qid].extend([""] * shortfall)


In [110]:
print(len(test_sample))

9371


## 計算模型的表現

In [111]:
# Evaluate the model on the test split: generate distractors for every question,
# compare them with the reference distractors / answer / answer-like options, and
# average the per-question metrics.
#
# NOTE(review): spare_num, return_num, answers_prob, distractors_prob, max_words_prob,
# one_word_prob, two_word_prob, three_word_prob, answer_num, not_related_prob,
# not_related_word, r10, _map and ndcg10 are initialised here but never updated or
# read in this cell -- confirm whether later cells need them.
sentence_count = 0
spare_num = 10
return_num = 1
model.eval()
answers_prob = 0.0
distractors_prob = 0.0
max_words_prob = 0.0

one_word_prob = 0.0
two_word_prob = 0.0
three_word_prob = 0.0
answer_num = 0

not_related_prob = 0.0
not_related_word = ["fan"]
# Number of batches; this equals the number of questions only when batch_size is 1.
n_question = len(test_dataLoader)

# Running sums of the per-question metrics (turned into means after the loop).
p1 = 0.
p3 = 0.
r1 = 0.
r3 = 0.
r10 = 0.
mrr = 0.
_map = 0.
ndcg3 = 0.
ndcg10 = 0.
f3 = 0.
alike3 = 0.

a3 = 0
with torch.no_grad():
    for batch in test_dataLoader:
        
        distractor_prob = 0.0
        three_word_list = []
        # One reference record is consumed per batch, so this pairing is only correct
        # with batch_size == 1 and an unshuffled DataLoader.
        answer_list = [test_sample[sentence_count]['answer']]
        distractor_list = test_sample[sentence_count]['distractors']
        # Entries of 'ranked_distractors' after the first three.
        # NOTE(review): presumably the answer-like (ambiguous) options -- confirm against the data format.
        alike_list = test_sample[sentence_count]['ranked_distractors'][3:]
        
        answer_set = set(answer_list)
        distractor_set = set(distractor_list)
        alike_set = set(alike_list)
        
        # No generation arguments are passed, so the model's default generation settings apply.
        output = model.generate(
            input_ids = batch['input_ids'][:],  
            attention_mask = batch['attention_mask'][:],
        )
        batch_output = tokenizer.batch_decode(output, skip_special_tokens=True)
        
        # Parse the generated text into an ordered, duplicate-free list of words:
        # split on ", ", keep only the last space-separated word of the first chunk,
        # and strip every non-letter character from each chunk.
        pred_dist = []
        for b in batch_output:
            b_list = b.split(", ")
            b_list[0] = b_list[0].split(' ')[-1]
            for i in b_list:
                i_trim = keep_only_letters(i)
                if i_trim not in pred_dist:
                    pred_dist.append(i_trim)
        
        p1 += eval_precision(distractor_list, pred_dist, 1)
        p3_score = eval_precision(distractor_list, pred_dist, 3)
        p3 += p3_score
        r1_score = eval_recall(distractor_list, pred_dist, 1)
        r1 += r1_score
        # r10 += eval_recall(distractor_list, pred_dist, 10)
        # NDCG@3 is only computed when at least three predictions exist; other questions
        # add nothing but are still counted in the final division by n_question.
        if len(pred_dist) > 2:
            ndcg3 += eval_ndcg(distractor_list, pred_dist, 3)
        # ndcg10 += eval_ndcg(distractor_list, pred_dist, 10)
        mrr += eval_mrr(distractor_list, pred_dist)
        pred3_set = set(pred_dist[:3])

        # Top-3 overlap with the reference distractors (p_3, r_3), the answer (a_3)
        # and the answer-like options (alike_3, always divided by 3).
        p_3 = len(distractor_set & pred3_set) / float(3)
        r_3 = len(distractor_set & pred3_set) / float(len(distractor_set))
        a_3 = len(answer_set & pred3_set) / float(1)
        alike_3 = len(alike_set & pred3_set) / float(3)
        
        # Harmonic mean of p_3 and r_3, defined as 0 when both are 0.
        if p_3 == 0 and r_3 == 0:
            f1_3 = 0
        else:
            f1_3 = 2 * (p_3 * r_3 / (p_3 + r_3))

        r3+=r_3
        f3+=f1_3
        a3+=a_3
        alike3 += alike_3
        
        sentence_count += 1
        
# Every batch must have been paired with exactly one reference record.
assert sentence_count == n_question, f"sentence length not match"    

# Turn the running sums into per-question means.
p1 = p1 / n_question
p3 = p3 / n_question
r1 = r1 / n_question
r3 = r3 / n_question
# r10 = r10 / n_question
ndcg3 = ndcg3 / n_question
# ndcg10 = ndcg10 / n_question
f3 = f3 / n_question
a3 = a3 / n_question
mrr = mrr / n_question
alike3 = alike3 / n_question

result = {
    'P@1': p1,
    'P@3': p3,
    'R@1': r1,
    'R@3': r3,
    # 'R@10': r10,
    'NDCG@3': ndcg3,
    # 'NDCG@10': ndcg10,
    'MRR': mrr,
    'F1@3': f3,
    'A@3': a3,
    'alike@3': alike3,
    'n_question': n_question,
    }
       
print(result)

{'P@1': 0.27008857112367946, 'P@3': 0.1829402767402968, 'R@1': 0.09004730907409661, 'R@3': 0.18299363283889888, 'NDCG@3': 0.22640698584644328, 'MRR': 0.3278554405435215, 'F1@3': 0.18296161917973763, 'A@3': 0.09593426528652226, 'alike@3': 0.05143527905239503, 'n_question': 9371}
