In [5]:
from pathlib import Path
from rouge import Rouge
from nltk.translate import bleu_score as bleu_scorer
from nltk.translate import meteor_score as nltk_meteor_scorer
import nltk.translate.gleu_score as gleu_scorer
import numpy as np
from tqdm.auto import tqdm
from collections import Counter

In [6]:
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    EncoderDecoderModel,
    AutoTokenizer,
    PegasusTokenizer,
    # PegasusXForConditionalGeneration,
    HfArgumentParser,
    MBartTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
    # MvpTokenizer,
)

# Code for NLU&POL

In [7]:
def remove_sp_tokens(txt):
    """Remove tokenizer/model special markers from ``txt`` and normalise spacing.

    Each marker in ``sp_tokens`` is replaced by a single space (plain substring
    replacement, applied in list order). Square brackets are then padded with
    spaces so that bracketed keys such as ``[poi]`` become stand-alone tokens,
    and runs of whitespace are collapsed.

    Args:
        txt: raw text line.

    Returns:
        The cleaned text with tokens separated by single spaces.
    """
    sp_tokens = [
        '[CLS]',
        '[PAD]',
        '[SEP]',
        # NOTE: only the "[unused" prefix is removed, so "[unused12]" leaves "12]" behind.
        "[unused",
        '__start__',
        '__null__',
        '__end__',
        '<pad>',
        '</s>',
        '<s>',
        '.',
        '�',
        # NOTE(review): the original list also held entries that are empty
        # strings in this copy of the file (presumably unprintable characters
        # lost in an encoding round-trip - TODO confirm and restore them as
        # explicit escape sequences).
    ]

    new_text = txt
    for e in sp_tokens:
        # str.replace("", " ") would insert a space between every character and
        # break every token apart, so an empty marker must never be applied.
        if not e:
            continue
        new_text = new_text.replace(e, " ")

    # Detach bracketed keys from neighbouring text: "a[poi]b" -> "a [poi] b".
    new_text = new_text.replace(']', "] ").replace('[', " [")
    # split() with no argument already drops leading/trailing/repeated whitespace.
    return ' '.join(new_text.split())

def remove_puncs(txt):
    """Replace punctuation characters in ``txt`` with spaces and collapse whitespace.

    The characters ``[``, ``]`` and ``_`` are not in the list, so they are
    left untouched.

    Args:
        txt: text line to clean.

    Returns:
        The text with every listed punctuation character removed and tokens
        separated by single spaces.
    """
    puncs = [
        '!', '"', '#', '$', '%', '&', "'", '(',
        ')', '*', '+', ',', '-', '.', '/', ':',
        # A single backslash, '^' and '`' are separate entries. Previously a
        # missing comma fused '^' and '`' into the two-character string '^`',
        # and r'\\' matched only a doubled backslash.
        ';', '<', '=', '>', '?', '@', '\\', '^',
        '`', '{', '|', '}', '~',
    ]
    # One translation pass maps every punctuation character to a space.
    table = str.maketrans({p: " " for p in puncs})
    return ' '.join(txt.translate(table).split())

def remove_dup_tokens(txt):
    """Collapse runs of identical consecutive tokens into a single token.

    Tokens are whitespace-separated; only adjacent repeats are removed, so
    "a a b a" becomes "a b a".
    """
    tokens = txt.split()
    # Pair every token with its predecessor (None for the first one) and keep
    # it only when it differs from that predecessor.
    kept = [
        cur
        for prev, cur in zip([None] + tokens, tokens)
        if cur != prev
    ]
    return ' '.join(kept)

def check_key(t):
    """Return True when token ``t`` begins with '[' and ends with ']' (e.g. "[poi]")."""
    opens_bracket = t[:1] == '['
    closes_bracket = t[-1:] == ']'
    return opens_bracket and closes_bracket

def process_single(text):
    """Parse a flat "[key] value ... [key] value ..." string into key/value groups.

    Example input: "QAB [poi] Chevron [poi] HR [exit]".
    Tokens before the first bracketed key are ignored, and a key that is not
    followed by at least one value token is dropped.

    Returns:
        dict with
          "data":        list of [key, value_token, ...] lists,
          "keys":        the key of each group,
          "values":      the space-joined value of each group,
          "combined_kv": "<key>-<value>" strings (duplicates are kept).
    """
    groups = []
    for token in text.split():
        if check_key(token):
            # A bracketed token opens a new group.
            groups.append([token])
        elif groups:
            # A value token belongs to the most recently opened key.
            groups[-1].append(token)
        # Otherwise: a value token seen before any key - skip it.

    # Keep only keys that actually received a value.
    data = [group for group in groups if len(group) >= 2]

    keys = [group[0] for group in data]
    values = [' '.join(group[1:]) for group in data]
    kvs = [f"{k}-{v}" for k, v in zip(keys, values)]

    return {
        "data": data,
        "keys": keys,
        "values": values,
        "combined_kv": kvs,
    }


def diff_prediction_source(pred, sour):
    """Compare two token lists as multisets.

    Example:
        pred = ["a","c","b","v","c","b","b"]
        sour = ["a","z","c","b","b","c","c","c"]

        overlap:          ["a","c","c","b","b"]
        pred_not_in_sour: ["b","v"]
        sour_not_in_pred: ["z","c","c"]

    Returns:
        (overlap, pred_not_in_sour, sour_not_in_pred) as three lists. Each
        element is repeated according to its multiplicity and grouped by
        element, ordered by first appearance in ``pred`` (first two lists) or
        in ``sour`` (last list).
    """
    pred_counts = Counter(pred)
    sour_counts = Counter(sour)

    # Present in both: per-element minimum of the two counts.
    shared = pred_counts & sour_counts
    # Surplus occurrences on either side; Counter subtraction keeps only
    # positive counts.
    only_pred = pred_counts - sour_counts
    only_sour = sour_counts - pred_counts

    return (
        list(shared.elements()),
        list(only_pred.elements()),
        list(only_sour.elements()),
    )

def eval_nlu_and_pol(predictions,targets):
    """Score predicted key-value pairs against targets with precision/recall/F1.

    Each prediction/target string is parsed with ``process_single`` and the
    resulting "key-value" strings are compared as multisets. Counts are pooled
    over all line pairs before the ratios are computed. A pair in which both
    sides contain no key-value pair at all counts as one true positive.

    Args:
        predictions: iterable of predicted lines.
        targets: iterable of reference lines; pairs are formed with ``zip``,
            so surplus lines on the longer side are ignored.

    Returns:
        dict with "f1", "precision" and "recall" as fractions in [0, 1].
    """
    eps = 1e-10  # keeps the divisions defined when a denominator would be zero
    tp = fp = fn = 0

    for pred, targ in zip(predictions, targets):
        pred_kv = process_single(pred)['combined_kv']
        targ_kv = process_single(targ)['combined_kv']

        if pred_kv or targ_kv:
            matched, spurious, missed = diff_prediction_source(pred_kv, targ_kv)
            tp += len(matched)
            fp += len(spurious)
            fn += len(missed)
        else:
            # Nothing expected and nothing predicted: credit as a single hit.
            tp += 1

    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    f1 = 2 * precision * recall / (precision + recall + eps)

    return {
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

def eval__nlu_and_pol_by_fname(pred_f,targ_f,remove_sp_token=True,remove_punc=True,lower=True,remove_dup_token=False):
    """Load prediction/target files, normalise every line and run ``eval_nlu_and_pol``.

    Args:
        pred_f: path of the file with one prediction per line.
        targ_f: path of the file with one target per line.
        remove_sp_token: apply ``remove_sp_tokens`` to every line.
        remove_punc: apply ``remove_puncs`` to every line.
        lower: lower-case every line.
        remove_dup_token: apply ``remove_dup_tokens`` to every line.

    Returns:
        The dict produced by ``eval_nlu_and_pol``.
    """
    loaded = []
    for fname in (pred_f, targ_f):
        with Path(fname).absolute().resolve().open() as handle:
            # NOTE(review): the whole file is stripped before splitting, so blank
            # lines at the very start or end of a file are dropped, which can
            # shift the line pairing between the two files.
            loaded.append(handle.read().strip().splitlines())
    predictions, targets = loaded

    # Collect the enabled normalisation steps in their fixed order.
    steps = []
    if remove_sp_token:
        steps.append(remove_sp_tokens)
    if remove_punc:
        steps.append(remove_puncs)
    if lower:
        steps.append(str.lower)
    if remove_dup_token:
        steps.append(remove_dup_tokens)

    for step in steps:
        predictions = [step(line) for line in predictions]
        targets = [step(line) for line in targets]

    return eval_nlu_and_pol(predictions, targets)

def print_results(results, modelname=None):
    """Print precision/recall/F1 from ``results`` as percentages, followed by a blank line.

    Args:
        results: dict with "precision", "recall" and "f1" fractions.
        modelname: optional label printed as a "Model: ..." header line.
    """
    report = []
    if modelname is not None:
        report.append(f"Model: {modelname}")
    report.append(f"Precision: {results['precision']*100:.2f}")
    report.append(f"Recall:    {results['recall']*100:.2f}")
    report.append(f"F1:        {results['f1']*100:.2f}")
    report.append("")  # trailing blank line separates consecutive reports
    print("\n".join(report))

# Code for NLG

In [8]:
def rouge_score_pip(answers,refs):
    """Average ROUGE-L F-score of ``answers`` against ``refs``, scaled by 100.

    Uses the ``rouge`` package: https://pypi.org/project/rouge/

    Args:
        answers: list of hypothesis strings.
        refs: list of reference strings, paired with ``answers`` by position.
    """
    averaged = Rouge().get_scores(answers, refs, avg=True)
    rouge_l = averaged['rouge-l']
    return rouge_l['f'] * 100

def bleu_n_scores(answers,refs,n=4):
    """Corpus-level BLEU with uniform weights over 1..n-grams, scaled by 100.

    No ``smoothing_function`` is passed to ``corpus_bleu``, so the library
    default applies. The smoothing function that used to be built here was
    never passed to the scorer and has been removed.

    Args:
        answers: list of hypothesis strings (whitespace-tokenised here).
        refs: list of reference strings, one per hypothesis.
        n: highest n-gram order; each order gets weight 1/n.

    Returns:
        The corpus BLEU score multiplied by 100.
    """
    weights = [1 / n] * n
    hypotheses = [answer.split() for answer in answers]
    # corpus_bleu takes a list of references per hypothesis; each hypothesis
    # has exactly one reference here, so wrap it in a one-element list.
    references = [[ref.split()] for ref in refs]
    score = bleu_scorer.corpus_bleu(
        references,
        hypotheses,
        weights=weights,
    )
    return score * 100

def meteor_score_by_nltk(answers,refs):
    """Mean sentence-level METEOR of ``answers`` against ``refs``, scaled by 100.

    Each answer is scored against its single, whitespace-tokenised reference
    and the per-sentence scores are averaged with ``np.mean``.
    """
    per_sentence = [
        nltk_meteor_scorer.meteor_score([ref.split()], ans.split())
        for ans, ref in zip(answers, refs)
    ]
    return np.mean(per_sentence) * 100

def gleu_score_by_nltk(answers,refs):
    """Corpus-level GLEU of ``answers`` against ``refs``, scaled by 100.

    Both sides are whitespace-tokenised; every hypothesis has exactly one
    reference, wrapped in a one-element list as ``corpus_gleu`` expects.
    """
    references = [[ref.split()] for ref in refs]
    hypotheses = [answer.split() for answer in answers]
    return gleu_scorer.corpus_gleu(references, hypotheses) * 100

def eval_nlg(predictions,targets):
    """Compute BLEU-4, ROUGE-L, METEOR and GLEU for generated text.

    Pairs in which either the prediction or the target is blank (empty after
    ``strip()``) are dropped before scoring.

    Args:
        predictions: iterable of generated strings.
        targets: iterable of reference strings, paired by position via ``zip``.

    Returns:
        dict with keys "BLEU-4", "ROUGE-L", "METEOR" and "GLEU".
    """
    kept_pairs = [
        (pred, targ)
        for pred, targ in zip(predictions, targets)
        if pred.strip() != "" and targ.strip() != ""
    ]
    prediction_nonempty = [pred for pred, _ in kept_pairs]
    target_nonempty = [targ for _, targ in kept_pairs]

    return {
        "BLEU-4": bleu_n_scores(prediction_nonempty, target_nonempty, n=4),
        "ROUGE-L": rouge_score_pip(prediction_nonempty, target_nonempty),
        "METEOR": meteor_score_by_nltk(prediction_nonempty, target_nonempty),
        "GLEU": gleu_score_by_nltk(prediction_nonempty, target_nonempty),
    }

def eval_nlg_by_fname(pred_f,targ_f,remove_sp_token=True,remove_punc=True,lower=True,remove_dup_token=False):
    """Load prediction/target files, normalise every line and run ``eval_nlg``.

    Args:
        pred_f: path of the file with one generated sentence per line.
        targ_f: path of the file with one reference sentence per line.
        remove_sp_token: apply ``remove_sp_tokens`` to every line.
        remove_punc: apply ``remove_puncs`` to every line.
        lower: lower-case every line.
        remove_dup_token: accepted but has no effect in this function.

    Returns:
        The dict produced by ``eval_nlg``.
    """
    loaded = []
    for fname in (pred_f, targ_f):
        with Path(fname).absolute().resolve().open() as handle:
            # NOTE(review): the whole file is stripped before splitting, so blank
            # lines at the very start or end of a file are dropped, which can
            # shift the line pairing between the two files.
            loaded.append(handle.read().strip().splitlines())
    predictions, targets = loaded

    # Collect the enabled normalisation steps in their fixed order.
    steps = []
    if remove_sp_token:
        steps.append(remove_sp_tokens)
    if remove_punc:
        steps.append(remove_puncs)
    if lower:
        steps.append(str.lower)

    for step in steps:
        predictions = [step(line) for line in predictions]
        targets = [step(line) for line in targets]

    return eval_nlg(predictions, targets)


def print_nlg_results(results, modelname=None):
    """Print the NLG metrics in ``results`` with two decimals, followed by a blank line.

    Args:
        results: dict with "BLEU-4", "ROUGE-L", "METEOR" and "GLEU" values.
        modelname: optional label printed as a "Model: ..." header line.
    """
    bleu, rouge, meteor, gleu = (
        results[key] for key in ('BLEU-4', 'ROUGE-L', 'METEOR', 'GLEU')
    )
    report = []
    if modelname is not None:
        report.append(f"Model: {modelname}")
    report.append(f"BLEU-4:  {bleu:.2f}")
    report.append(f"ROUGE-L: {rouge:.2f}")
    report.append(f"METEOR:  {meteor:.2f}")
    report.append(f"GLEU:    {gleu:.2f}")
    report.append("")  # trailing blank line separates consecutive reports
    print("\n".join(report))

In [38]:
# NLU & POL evaluation for our model.
# The settings below are interpolated into the inference-results directory name
# and into the result/target file names; switch a setting by swapping which
# alternative is commented out.
model = "gpt2"

mode = "history"
# mode = "profile"
# mode = "history-profile"


task = "NLU"
# task = "POL"


# if_bs = False

# if_ori = False
# Adds the "ori-" directory prefix when True (what "ori" stands for is not shown here - TODO confirm).
if_ori = True
ori = "ori-" if if_ori else ""

# Adds the "-bs" directory suffix when True (presumably beam search - TODO confirm).
if_bs = True
bs = "-bs" if if_bs else ""

# Directory holding the inference outputs for this configuration, relative to the working directory.
fpath = Path(f'./inference-results/{ori}{mode}-{model}{bs}/').absolute().resolve()
# if if_bs:
#     fpath = Path(f'./inference-results/{mode}-{model}-bs/').absolute().resolve()
# else:
#     fpath = Path(f'./inference-results/{mode}-{model}/').absolute().resolve()



print(f"mode: {mode}")
print(f"task: {task}\n")
# Dataset split to evaluate.
dset = "test"
# dset = "dev"
result_fpath = fpath.joinpath(f"{dset}-{task}.result")
target_fpath = fpath.joinpath(f"{dset}-{task}.target")


# Alternative inputs (other models' prediction files), kept for manual switching:
# result_fpath = Path('/home/jitianbo/Workspace/driver_simulator_kvret/dialog-task/predictions/NLU-bart-large-test.txt')
# result_fpath = Path('/home/jitianbo/Workspace/driver_simulator_kvret/dialog-task/predictions/NLU-t5-large-test.txt')
# target_fpath = Path('/home/jitianbo/Workspace/driver_simulator_kvret/dialog-task/NLU/test.target')

# result_fpath = Path('/home/jitianbo/Workspace/driver_simulator_kvret/dialog-task/predictions/DST-bart-large-test.txt')
# result_fpath = Path('/home/jitianbo/Workspace/driver_simulator_kvret/dialog-task/predictions/DST-t5-large-val.txt')
# target_fpath = Path('/home/jitianbo/Workspace/driver_simulator_kvret/dialog-task/DST/test.target')

# NOTE(review): `predictions` and `targets` are loaded here but are not passed to
# the evaluator below, which receives the paths and re-reads both files itself.
with result_fpath.open() as f:
    predictions = f.read().strip().splitlines()
with target_fpath.open() as f:
    targets = f.read().strip().splitlines()
# Length check is disabled; the evaluator pairs lines with zip, so surplus lines are ignored.
# assert len(predictions) == len(targets)
results = eval__nlu_and_pol_by_fname(result_fpath,target_fpath,remove_dup_token=False)
print_results(results)

mode: history
task: NLU

Precision: 88.94
Recall:    91.36
F1:        90.13



In [39]:
# NLG evaluation for our model.
# The settings below are interpolated into the inference-results directory name
# and into the result/target file names; switch a setting by swapping which
# alternative is commented out.
mode = "history"
# mode = "profile"
# mode = "history-profile"
model = "gpt2"

# if_bs = False
# Adds the "ori-" directory prefix when True (what "ori" stands for is not shown here - TODO confirm).
if_ori = True
ori = "ori-" if if_ori else ""

# Adds the "-bs" directory suffix when True (presumably beam search - TODO confirm).
if_bs = True
bs = "-bs" if if_bs else ""

# Directory holding the inference outputs for this configuration, relative to the working directory.
fpath = Path(f'./inference-results/{ori}{mode}-{model}{bs}/').absolute().resolve()

task = "NLG"
# Dataset split to evaluate.
dset = "test"
# dset = "dev"
result_fpath = fpath.joinpath(f"{dset}-{task}.result")
target_fpath = fpath.joinpath(f"{dset}-{task}.target")


# The evaluator reads both files itself; the resolved directory is printed as the model label.
results = eval_nlg_by_fname(result_fpath,target_fpath)
print_nlg_results(results, modelname=str(fpath))

# Alternative inputs (other models' prediction files), kept for manual switching:
# result_fpath = Path('/home/jitianbo/Workspace/driver_simulator_kvret/dialog-task/predictions/NLG-t5-large-test.txt')
# result_fpath = Path('/home/jitianbo/Workspace/driver_simulator_kvret/dialog-task/predictions/DST-t5-large-val.txt')
# target_fpath = Path('/home/jitianbo/Workspace/driver_simulator_kvret/dialog-task/NLG/test.target')


# with result_fpath.open() as f:
#     predictions = f.read().strip().splitlines()
# with target_fpath.open() as f:
#     targets = f.read().strip().splitlines()

Model: /home/jitianbo/Workspace/driver_simulator_kvret/simulator/ablation/inference/inference-results/ori-history-gpt2-bs
BLEU-4:  27.82
ROUGE-L: 48.67
METEOR:  48.43
GLEU:    27.41

