# 計算 model 中 被認為是 answer 的 token 的出現機率

### GPU

In [39]:
# Show the GPUs on this machine and their current memory usage.
!nvidia-smi

Tue Jul 18 11:50:15 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090         On | 00000000:01:00.0 Off |                  Off |
|  0%   32C    P8               20W / 450W|  12537MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090         On | 00000000:04:00.0 Off |  

### Weights & Biases (optional metric logging)

In [40]:
# Optional: install and authenticate the wandb client used for metric logging.
!pip install wandb
!wandb login
project_name = "Evaluate Ambiguous Options Probability"
import os

# The training-arguments cell reads WANDB_PROJECT to decide whether to report to wandb.
os.environ["WANDB_PROJECT"] = project_name

[34m[1mwandb[0m: Currently logged in as: [33mreily[0m ([33mblurr[0m). Use [1m`wandb login --relogin`[0m to force relogin


### Imports and device selection (use the GPU when available)

In [41]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import json
from pprint import pprint

# Report how many CUDA devices PyTorch can see.
num_gpus = torch.cuda.device_count()
print(f'Detect {num_gpus} GPUS')
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")

Detect 2 GPUS


### Loading the dataset

In [42]:
def read_data(item):
    """Load a ranked-distractor JSON file for the given item name.

    Args:
        item: Name inserted into the file name
            ``../data/response_extract/{item}_distractor_rank_alike.json``.

    Returns:
        The parsed JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    path = '../data/response_extract/{}_distractor_rank_alike.json'.format(item)
    # Read as UTF-8 explicitly so decoding does not depend on the platform locale.
    with open(path, encoding='utf-8') as f:
        data = json.load(f)
    return data

In [43]:
def read_clothf_data():
    """Load the cleaned CLOTH-F dataset from its JSON file.

    Returns:
        The parsed JSON content of
        ``../data/CLOTH-F/clean_cloth-f_dataset.json``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    path = '../data/CLOTH-F/clean_cloth-f_dataset.json'
    # Read as UTF-8 explicitly so decoding does not depend on the platform locale.
    with open(path, encoding='utf-8') as f:
        data = json.load(f)
    return data

# Load the CLOTH-F file and pull out its three splits by key.
chatGPT_answer_data = read_clothf_data()
train = chatGPT_answer_data['train']
valid = chatGPT_answer_data['valid']
test = chatGPT_answer_data['test']

## 處理 load 進來的資料

### Prepare data

In [44]:
# Inspect the first raw test example to see its fields.
test[0]

{'sentence': 'I met Kurt Kampmeir of Success Motivation Incorporation for breakfast. While we were _ ,Kurt askedme, " John, what is your plan for personal growth? Never at a loss for words, I tried to find things in my life that might qualify for growth.I toldhim about the many activities in which I was involved . ',
 'distractors': ['working', 'preparing', 'thinking'],
 'answer': 'eating',
 'index': 0}

In [45]:
# Generation is done one word at a time (the original note was left unfinished here).
def make_model_input(data, task_prefix):
    """Build model inputs and target strings from cloze examples.

    Args:
        data: Iterable of dicts with the keys 'sentence', 'distractors'
            and 'answer'.
        task_prefix: String prepended to every model input.

    Returns:
        A 5-tuple ``(sentences, targets, answers, model_inputs, targets)``.
        The second and fifth elements are the same list object holding the
        '_ of distractors are ...' target strings; each model input is
        ``task_prefix + sentence + '</s>' + answer``.
    """
    sentences, answers = [], []
    targets, model_inputs = [], []

    for example in data:
        sentence = example['sentence']
        distractors = example['distractors']
        answer = example['answer']

        # Source side: prefix, cloze sentence, separator, gold answer.
        source_text = ''.join((task_prefix, sentence, '</s>', answer))
        # Target side: the fixed template followed by the comma-joined distractors.
        target_text = '_ of distractors are ' + ', '.join(distractors)

        sentences.append(sentence)
        targets.append(target_text)
        answers.append(answer)
        model_inputs.append(source_text)

    return sentences, targets, answers, model_inputs, targets

task_prefix = 'distractor generation: '
# NOTE: make_model_input returns the same target-string list in positions 2 and 5,
# so each *_distractors and *_list_distractors pair below refers to one list.
train_sentences, train_distractors, train_answers, train_sent, train_list_distractors = make_model_input(train, task_prefix)
valid_sentences, valid_distractors, valid_answers, valid_sent, valid_list_distractors = make_model_input(valid, task_prefix)
test_sentences, test_distractors, test_answers, test_sent, test_list_distractors = make_model_input(test, task_prefix) # (translated note) test_sent comes in groups of five sentences, so the batch size can only be 5; otherwise outputs from other groups get mixed in

In [46]:
# Sanity check: inputs, answers and targets should all have the same length.
len(test_sent), len(test_answers), len(test_distractors)

(10233, 10233, 10233)

In [47]:
# Show the first model input and its target string.
pprint(test_sent[0])
pprint(test_distractors[0])

('distractor generation: I met Kurt Kampmeir of Success Motivation '
 'Incorporation for breakfast. While we were _ ,Kurt askedme, " John, what is '
 'your plan for personal growth? Never at a loss for words, I tried to find '
 'things in my life that might qualify for growth.I toldhim about the many '
 'activities in which I was involved . </s>eating')
'_ of distractors are working, preparing, thinking'


In [48]:
# Load the tokenizer from the same local directory the model is loaded from later.
tokenizer = T5Tokenizer.from_pretrained("../model/t5-distractor-generation-multitask-len3")

In [49]:
# Tokenize inputs and targets together (text_target yields the 'labels' field),
# pad within each split, and move the resulting tensors to `device`.
# NOTE(review): the printed labels below are padded with 0 rather than -100, so
# padded positions presumably count towards the loss -- confirm this is intended.
train_encodings = tokenizer(train_sent, truncation=True, padding=True, text_target=train_list_distractors, return_tensors="pt").to(device)
valid_encodings = tokenizer(valid_sent, truncation=True, padding=True, text_target=valid_list_distractors, return_tensors="pt").to(device)
test_encodings = tokenizer(test_sent, truncation=True, padding=True, text_target=test_list_distractors, return_tensors="pt").to(device)

In [50]:
# List the fields produced by the tokenizer.
test_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [51]:
# Token ids of the first test input, including padding.
print(test_encodings.input_ids[0])

tensor([15980,   127,  3381,    10,    27,  1736,  8333,    17,  8329,  2028,
           15,    23,    52,    13, 16581, 25279,    86, 14723,   127,   257,
           21,  3688,     5,   818,    62,   130,     3,   834,     3,     6,
          439,   450,    17,  1380,   526,     6,    96,  1079,     6,   125,
           19,    39,   515,    21,   525,  1170,    58,  8400,    44,     3,
            9,  1453,    21,  1234,     6,    27,  1971,    12,   253,   378,
           16,    82,   280,    24,   429,  9448,    21,  1170,     5,   196,
         1219, 10813,    81,     8,   186,  1087,    16,    84,    27,    47,
         1381,     3,     5,     1,  3182,     1,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [52]:
# Padded sequence length of the test inputs.
len(test_encodings.input_ids[0])

504

In [53]:
# Round-trip check: decode the first input back to text (special tokens kept).
tokenizer.decode(test_encodings.input_ids[0])

'distractor generation: I met Kurt Kampmeir of Success Motivation Incorporation for breakfast. While we were _,Kurt askedme, " John, what is your plan for personal growth? Never at a loss for words, I tried to find things in my life that might qualify for growth.I toldhim about the many activities in which I was involved.</s> eating</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><p

In [54]:
# Number of encoded test examples.
len(test_encodings.input_ids)

10233

In [55]:
# NOTE(review): identical to the previous cell; this one can be deleted.
len(test_encodings.input_ids)

10233

In [56]:
# NOTE(review): repeats an earlier inspection cell; this one can be deleted.
test_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [57]:
# Token ids of the first target string, including padding.
print(test_encodings.labels[0])

tensor([    3,   834,    13,  1028, 29676,    33,   464,     6,     3,  6955,
            6,  1631,     1,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0], device='cuda:0')


In [58]:
class ClothDataset(torch.utils.data.Dataset):
    """Dataset view over tokenizer encodings that yields one example per index.

    Each item is a dict with the same keys as ``encodings``; every value is a
    detached copy of the tensor row at that index.
    """

    def __init__(self, encodings):
        # Mapping-like object exposing .items() and an .input_ids attribute.
        self.encodings = encodings

    def __len__(self):
        # The number of examples equals the number of rows in input_ids.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        sample = {}
        for field, tensor in self.encodings.items():
            # Copy the row so callers cannot modify the stored encodings.
            sample[field] = tensor[idx].detach().clone()
        return sample

# Wrap the train/validation encodings for use by the Trainer.
train_dataset = ClothDataset(train_encodings)
valid_dataset = ClothDataset(valid_encodings)


In [59]:
class ClothDatasetcpu(torch.utils.data.Dataset):
    """Dataset view over tokenizer encodings that hands out CPU copies.

    Each item is a dict with the same keys as ``encodings``; every value is
    the tensor row at that index, detached, moved to the CPU and cloned.
    """

    def __init__(self, encodings):
        # Mapping-like object exposing .items() and an .input_ids attribute.
        self.encodings = encodings

    def __len__(self):
        # The number of examples equals the number of rows in input_ids.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        sample = {}
        for field, tensor in self.encodings.items():
            row = tensor[idx].detach()
            # Move to host memory first, then copy.
            sample[field] = row.cpu().clone()
        return sample
# The test split uses the variant whose items are copied to the CPU.
test_dataset = ClothDatasetcpu(test_encodings)

In [60]:
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
# Load the fine-tuned checkpoint from the local directory and move it to `device`.
model = T5ForConditionalGeneration.from_pretrained("../model/t5-distractor-generation-multitask-len3")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

### 使用 trainer 做 evaluation

In [61]:
# Batch size used per device for both training and evaluation.
batch_size = 16
args = Seq2SeqTrainingArguments(
    output_dir = "results",
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    load_best_model_at_end=True,
    # 'P@1' is one of the keys returned by compute_metrics.
    metric_for_best_model="P@1",
    num_train_epochs=10,
    # Evaluation/prediction decodes with generate(), so metrics see generated token ids.
    predict_with_generate=True,
    eval_accumulation_steps = 1,
    # Log to wandb only when WANDB_PROJECT is set in the environment.
    report_to="wandb" if os.getenv("WANDB_PROJECT") else "none"
)

In [62]:
from transformers import DataCollatorForSeq2Seq

# Collator that batches examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [63]:
def get_sent_cnt(labels):
    """Return the input-sentence list of the split whose size matches ``labels``.

    The split is identified purely by length: ``len(labels)`` is looked up
    among the sizes of the train, valid and test target lists. If two splits
    have the same size, the later one (train < valid < test) wins.

    Raises:
        KeyError: If ``len(labels)`` matches none of the three split sizes.
    """
    sentences_by_size = {}
    # Insert in train -> valid -> test order so later splits override on ties.
    sentences_by_size[len(train_distractors)] = train_sent
    sentences_by_size[len(valid_distractors)] = valid_sent
    sentences_by_size[len(test_distractors)] = test_sent
    return sentences_by_size[len(labels)]

In [64]:
import re
def keep_only_letters(text):
    """Return ``text`` with every character other than ASCII a-z / A-Z removed."""
    non_letter = re.compile(r"[^a-zA-Z]")
    return non_letter.sub("", text)

In [65]:
"""
評估實驗結果的指標
以克漏句子為單位計算
"""

import json
import argparse
from math import log

def cala_repeat(pred_distractors, d_isRepeat):
    """Tally how many duplicated options each prediction list contains.

    For every prediction list, the number of duplicates is its length minus
    the number of distinct entries. That count, capped at 3, is used as the
    key whose counter in ``d_isRepeat`` is incremented in place.

    Args:
        pred_distractors: Sequence of prediction lists.
        d_isRepeat: Mutable mapping/sequence indexed by 0..3; updated in place.
    """
    for options in pred_distractors:
        duplicates = len(options) - len(set(options))
        # Three or more duplicates all land in bucket 3.
        d_isRepeat[min(duplicates, 3)] += 1

def cala_answer(answers_text, pred_distractors, d_isAnswer):
    """Tally how often the gold answer shows up among the predicted options.

    Questions are paired with prediction lists by position; questions that
    have no prediction list are skipped. For each pair, the number of options
    equal to the answer is used as the key whose counter in ``d_isAnswer`` is
    incremented in place.

    Args:
        answers_text: Sequence of gold answers.
        pred_distractors: Sequence of prediction lists.
        d_isAnswer: Mutable mapping/sequence of counters; updated in place.
    """
    for answer, options in zip(answers_text, pred_distractors):
        matches = sum(1 for option in options if option == answer)
        d_isAnswer[matches] += 1

def eval_idcg(actual, predicted, k):
    """Compute the ideal DCG@k for a prediction list.

    The "ideal" ordering is the prediction list itself with all hits
    (predictions that occur in ``actual``) moved to the front; it is not
    derived from the size of ``actual``.

    Args:
        actual: Collection of relevant items.
        predicted: Ranked list of predicted items.
        k: Cut-off rank. When fewer than ``k`` predictions exist, the missing
            positions contribute nothing instead of raising IndexError.

    Returns:
        The ideal DCG as a float, using a base-2 log discount.
    """
    # Binary relevance per prediction, hits first.
    gains = sorted((1 if pred in actual else 0 for pred in predicted), reverse=True)

    idcg = 0.
    # Slicing caps the loop at the number of available predictions.
    for rank, rel in enumerate(gains[:k], start=1):
        idcg += rel / log(rank + 1, 2)
    return idcg

def eval_dcg(actual, predicted, k):
    """Compute DCG@k with binary relevance.

    Args:
        actual: Collection of relevant items.
        predicted: Ranked list of predicted items.
        k: Cut-off rank. When fewer than ``k`` predictions exist, the missing
            positions contribute nothing instead of raising IndexError.

    Returns:
        The sum of ``1 / log2(rank + 1)`` over the hits within the top ``k``.
    """
    dcg = 0.
    # Slicing caps the loop at the number of available predictions.
    for rank, pred in enumerate(predicted[:k], start=1):
        if pred in actual:
            dcg += 1 / log(rank + 1, 2)
    return dcg


def eval_map(actual, predicted):
    """Compute mean average precision over a set of questions.

    For each question, average precision is the sum of precision@i taken at
    every rank ``i`` where a new relevant item is retrieved, divided by the
    number of relevant items for that question. The result is the mean of
    these values over all questions, so it lies in [0, 1].

    Args:
        actual: Sequence of relevant-item collections, one per question.
        predicted: Sequence of ranked prediction lists, aligned with ``actual``.

    Returns:
        The mean average precision as a float; 0.0 when there are no questions.
    """
    n = len(actual)
    if n == 0:
        return 0.

    _map = 0.
    for qid in range(n):
        relevant = actual[qid]
        if len(relevant) == 0:
            # No relevant items: this question contributes 0.
            continue

        hits = 0
        precision_sum = 0.
        seen = set()
        for position, candidate in enumerate(predicted[qid], start=1):
            # A repeated prediction must not be rewarded twice.
            if candidate in relevant and candidate not in seen:
                seen.add(candidate)
                hits += 1
                # precision@position = hits so far / rank.
                precision_sum += hits / position
        _map += precision_sum / len(relevant)
    return _map / n

def eval_mrr(actual, predicted):
    """Return the reciprocal rank of the first hit in ``predicted``.

    Args:
        actual: Collection of relevant items.
        predicted: Ranked list of predicted items.

    Returns:
        ``1 / rank`` of the first prediction found in ``actual`` (1-based),
        or 0.0 when no prediction is relevant.
    """
    for position, candidate in enumerate(predicted, start=1):
        if candidate in actual:
            # Only the first hit counts.
            return 1.0 / position
    return 0.

def eval_recall(actual, predicted, k):
    """Compute recall@k over distinct items.

    Args:
        actual: Collection of relevant items.
        predicted: Ranked list of predicted items.
        k: Cut-off rank; only the first ``k`` predictions are considered.

    Returns:
        The fraction of distinct relevant items found in the top ``k``
        predictions, or 0.0 when ``actual`` is empty (rather than raising
        ZeroDivisionError).
    """
    act_set = set(actual)
    if not act_set:
        # Nothing to recall; avoid dividing by zero.
        return 0.
    pred_set = set(predicted[:k])
    return len(act_set & pred_set) / float(len(act_set))

def eval_precision(actual, predicted, k):
    """Compute precision@k over distinct items.

    Args:
        actual: Collection of relevant items.
        predicted: Ranked list of predicted items.
        k: Cut-off rank; also used as the denominator.

    Returns:
        The number of distinct relevant items among the first ``k``
        predictions, divided by ``k``.
    """
    relevant = set(actual)
    top_k = set(predicted[:k])
    overlap = relevant & top_k
    return len(overlap) / float(k)


def eval_ndcg(actual, predicted, k):
    """Compute NDCG@k as ``eval_dcg`` divided by ``eval_idcg``.

    Args:
        actual: Collection of relevant items.
        predicted: Ranked list of predicted items.
        k: Cut-off rank passed through to both helpers.

    Returns:
        ``dcg / idcg``, or 0.0 when the ideal DCG is zero.
    """
    gain = eval_dcg(actual, predicted, k)
    ideal = eval_idcg(actual, predicted, k)
    if ideal == 0:
        # No hit at all: define NDCG as zero instead of dividing by zero.
        return 0.
    return gain / ideal

def process(actual, predicted):
    """Pad ``predicted`` in place so it lines up with ``actual``.

    For every question in ``actual``:
    - if there is no prediction list at that position, append a list of
      empty strings as long as the gold list;
    - otherwise, if the prediction list is shorter than the gold list, append
      empty strings until the lengths match. Longer lists are left alone.

    Args:
        actual: Sequence of gold lists.
        predicted: List of prediction lists; modified in place.

    Returns:
        None.
    """
    for qid, gold in enumerate(actual):
        if qid >= len(predicted):
            # Missing prediction: add an all-blank placeholder row.
            predicted.append([""] * len(gold))
            continue

        shortfall = len(gold) - len(predicted[qid])
        # A non-positive shortfall produces an empty list, i.e. a no-op.
        predicted[qid].extend([""] * shortfall)


In [66]:
import copy
# Work on a deep copy so later cells cannot modify the original test split.
test_sample = copy.deepcopy(test)

In [67]:
import numpy as np
def _split_options(decoded_text):
    """Turn a decoded '_ of distractors are a, b, c' string into ['a', 'b', 'c']."""
    options = decoded_text.split(', ')
    # The first chunk still carries the template prefix; keep only its last word.
    options[0] = options[0].split(' ')[-1]
    return options


def compute_metrics(p):
    """Score generated distractors against the reference distractors.

    Args:
        p: Pair ``(predictions, labels)`` of token-id arrays as handed over by
            the trainer. Positions equal to -100 in either array are treated
            as padding.

    Returns:
        Dict with the keys 'P@1', 'P@3', 'R@1', 'R@3' and 'F1@3', each averaged
        over all evaluated questions. All values are 0.0 when there is nothing
        to evaluate.
    """
    predictions, labels = p

    # -100 marks ignored positions and cannot be decoded; swap it for the pad id.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    n_question = len(decoded_labels)
    if n_question == 0:
        # Avoid dividing by zero on an empty evaluation set.
        return {'P@1': 0., 'P@3': 0., 'R@1': 0., 'R@3': 0., 'F1@3': 0.}

    p1 = 0.
    p3 = 0.
    r1 = 0.
    r3 = 0.
    f3 = 0.
    sentence_count = 0

    for k in range(n_question):
        pred_list = _split_options(decoded_preds[k])
        label_list = _split_options(decoded_labels[k])

        act_set = set(label_list)
        pred1_set = set(pred_list[:1])
        pred3_set = set(pred_list[:3])

        p_1 = len(act_set & pred1_set) / float(1)
        p_3 = len(act_set & pred3_set) / float(3)
        r_1 = len(act_set & pred1_set) / float(len(act_set))
        r_3 = len(act_set & pred3_set) / float(len(act_set))

        # Harmonic mean of P@3 and R@3; defined as 0 when both are 0.
        if p_3 == 0 and r_3 == 0:
            f1_3 = 0
        else:
            f1_3 = 2 * (p_3 * r_3 / (p_3 + r_3))

        p1 += p_1
        p3 += p_3
        r1 += r_1
        r3 += r_3
        f3 += f1_3

        sentence_count += 1

    assert sentence_count == n_question, "sentence length not match"

    return {
        'P@1': p1 / n_question,
        'P@3': p3 / n_question,
        'R@1': r1 / n_question,
        'R@3': r3 / n_question,
        'F1@3': f3 / n_question,
    }

In [68]:
# Build the trainer; in the cells shown here it is only used for prediction on the test set.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

## 使用 Trainer.predict 生成出預測的選項之後，再拿來計算分數
#### 之前實驗使用 model.generate 的結果會與使用 trainer.predict 的結果不同

In [69]:
# Generate predictions for the whole test set; `metrics` holds the compute_metrics
# scores with a 'test_' prefix, as shown in the output below.
predictions, labels, metrics = trainer.predict(test_dataset)
print('test: ')
metrics

test: 


{'test_loss': 0.6173003315925598,
 'test_P@1': 0.28271279194762045,
 'test_P@3': 0.18860549203556942,
 'test_R@1': 0.09427017166683342,
 'test_R@3': 0.18868692791295957,
 'test_F1@3': 0.1886380663865255,
 'test_runtime': 281.8912,
 'test_samples_per_second': 36.301,
 'test_steps_per_second': 1.135}

In [70]:
# Number of generated sequences (one per test example).
print(len(predictions))

10233


In [71]:
# Raw generated token ids for the first five test examples.
print(predictions[0:5])

[[    0     3   834    13  1028 29676    33  2508     6   464     6  6908
      1     0     0     0     0     0     0     0]
 [    0     3   834    13  1028 29676    33  1573     6  1573     6  1573
      1     0     0     0     0     0     0     0]
 [    0     3   834    13  1028 29676    33  1518     6  1518     6  1518
      1     0     0     0     0     0     0     0]
 [    0     3   834    13  1028 29676    33  1381     6  1381     6  1381
      1     0     0     0     0     0     0     0]
 [    0     3   834    13  1028 29676    33   733     6 10802     6   733
      1     0     0     0     0     0     0     0]]


In [72]:
# Number of reference label sequences.
print(len(labels))

10233


In [73]:
# Reference label token ids for the first five test examples.
print(labels[0:5])

[[    3   834    13  1028 29676    33   464     6     3  6955     6  1631
      1     0     0     0     0     0     0     0     0     0     0]
 [    3   834    13  1028 29676    33 12939     6  2173     6  1690     1
      0     0     0     0     0     0     0     0     0     0     0]
 [    3   834    13  1028 29676    33  3958     6   320     6   580     1
      0     0     0     0     0     0     0     0     0     0     0]
 [    3   834    13  1028 29676    33 19779     6  1513     6  3827    15
     26     1     0     0     0     0     0     0     0     0     0]
 [    3   834    13  1028 29676    33  7177     6  3071     6  5054     1
      0     0     0     0     0     0     0     0     0     0     0]]


## 測試模型在 cloth-f 的表現，正解為題目的 distractor，A@3 為模型生成出三個選項與題目相同答案的機率

In [75]:
# Re-score the generated distractors against the raw CLOTH-F test examples and
# additionally measure A@3: how often the gold answer itself appears among the
# first three generated options.

# Index of the current example; keeps test_sample aligned with predictions.
sentence_count = 0
# NOTE(review): spare_num, return_num and the *_prob / answer_num / not_related_*
# variables below are assigned but never read in this cell.
spare_num = 10
return_num = 1
# NOTE(review): no model forward pass happens in this cell, so model.eval() and
# torch.no_grad() have no effect on the numbers computed here.
model.eval()
answers_prob = 0.0
distractors_prob = 0.0
max_words_prob = 0.0

one_word_prob = 0.0
two_word_prob = 0.0
three_word_prob = 0.0
answer_num = 0

not_related_prob = 0.0
not_related_word = ["fan"]
n_question = len(predictions)

# Running sums; divided by n_question after the loop.
p1 = 0.
p3 = 0.
r1 = 0.
r3 = 0.
r10 = 0.
mrr = 0.
_map = 0.
ndcg3 = 0.
ndcg10 = 0.
f3 = 0.
a3 = 0.

with torch.no_grad():
    # Despite the name, each `batch` is one row of `predictions`, i.e. the
    # generated token ids for a single test example.
    for batch in predictions:
        # print(f"batch {batch}")
        answer_list = [test_sample[sentence_count]['answer']]
        distractor_list = test_sample[sentence_count]['distractors']

        answer_set = set(answer_list)
        distractor_set = set(distractor_list)

        simple_output = tokenizer.decode(batch, skip_special_tokens=True)
        pred_dist = simple_output.split(', ')

        # The first chunk still contains the '_ of distractors are' template;
        # keep only its last space-separated word.
        pred_dist[0] = pred_dist[0].split(' ')[-1]

        p1 += eval_precision(distractor_list, pred_dist, 1)
        p3_score = eval_precision(distractor_list, pred_dist, 3)

        p3 += p3_score
        ndcg3_r = 0.
        mrr_r = eval_mrr(distractor_list, pred_dist)
        # eval_dcg indexes predicted[0..k-1], so NDCG@3 is only computed when at
        # least three options were generated; otherwise it stays 0.
        if len(pred_dist) > 2:

            ndcg3_r = eval_ndcg(distractor_list, pred_dist, 3)
        pred1_set = set(pred_dist[:1])
        pred3_set = set(pred_dist[:3])

        r_1 = len(distractor_set & pred1_set) / float(len(distractor_set))
        # p_3 uses the same formula as eval_precision(..., 3); it only feeds F1@3.
        p_3 = len(distractor_set & pred3_set) / float(3)
        r_3 = len(distractor_set & pred3_set) / float(len(distractor_set))
        # A@3: 1 when the gold answer is among the first three generated options.
        a_3 = len(answer_set & pred3_set) / float(1)
        if p_3 == 0 and r_3 == 0:
            f1_3 = 0
        else:
            f1_3 = 2 * (p_3 * r_3 / (p_3 + r_3))

        # p1+=p_1
        # p3+=p_3
        r1+=r_1
        r3+=r_3
        f3+=f1_3
        a3+=a_3
        mrr+=mrr_r
        ndcg3+=ndcg3_r

        sentence_count += 1

# Turn the running sums into per-question averages.
p1 = p1 / n_question
p3 = p3 / n_question
r1 = r1 / n_question
r3 = r3 / n_question
# r10 = r10 / n_question
ndcg3 = ndcg3 / n_question
# ndcg10 = ndcg10 / n_question
f3 = f3 / n_question
a3 = a3 / n_question
mrr = mrr / n_question

result = {
    'P@1': p1,
    'P@3': p3,
    'R@1': r1,
    'R@3': r3,
    # 'R@10': r10,
    'NDCG@3': ndcg3,
    # 'NDCG@10': ndcg10,
    'MRR': mrr,
    'F1@3': f3,
    'A@3': a3,
    'n_question': n_question,
    'sentence_count': sentence_count
    }

print(result)

{'P@1': 0.2828105150004886, 'P@3': 0.1887032150884376, 'R@1': 0.09428645884231145, 'R@3': 0.18873578943939365, 'NDCG@3': 0.3517905868963519, 'MRR': 0.3362650249193793, 'F1@3': 0.18871624482882002, 'A@3': 0.10808169647219779, 'n_question': 10233, 'sentence_count': 10233}
