In [1]:
import nltk
import evaluate
import json
import pickle
import codecs
import networkx as nx
from evaluate import load
import torch
import numpy
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Topic-extraction model: start from the public DIAL-BART0 tokenizer/model and
# replace the model weights with the ones stored in the local checkpoint file.
cot_tokenizer = AutoTokenizer.from_pretrained("prakharz/DIAL-BART0")
cot_model = AutoModelForSeq2SeqLM.from_pretrained("prakharz/DIAL-BART0")
# map_location='cpu': deserialize the checkpoint onto the CPU no matter which
# device it was saved from, so loading does not depend on that device existing.
# The model is moved to the GPU right after the weights are copied in.
cot_model.load_state_dict(
    torch.load('../CoT/topic_extraction/model/topic_er3.pt', map_location='cpu')
)

cot_model = cot_model.cuda()

In [4]:
def generate_cot(text_in, tok_in, mod_in):
    """Run the topic-extraction model on one utterance and return its decoded output.

    Args:
        text_in: Text to feed to the model.
        tok_in: Tokenizer. It is called as ``tok_in(text_in, return_tensors='pt')``
            and must also provide ``decode``.
        mod_in: Model exposing ``generate``. If it has a ``device`` attribute the
            inputs are moved there; otherwise ``'cuda:0'`` is used, as before.

    Returns:
        The first generated sequence decoded with special tokens removed.
    """
    # Follow the model's device instead of assuming the first GPU.
    device = getattr(mod_in, 'device', 'cuda:0')
    tok_text = tok_in(text_in, return_tensors='pt').to(device)
    gen_text = mod_in.generate(**tok_text, max_length=60)
    return tok_in.decode(gen_text[0], skip_special_tokens=True)

def CoT_to_Preference(cot):
    """Parse the extractor's output string into a ``{topic: preference}`` dict.

    Example: ``(sports,yes)|(football team,no)`` becomes
    ``{"sports": "positive", "football team": "negative"}``.

    Segments are separated by ``|``; each segment is ``topic,label`` with
    optional parentheses. ``yes`` maps to ``positive``, ``no`` to ``negative``
    and any other label to ``unknown``. Whitespace around the topic and the
    label is ignored and the label is matched case-insensitively, so
    ``(sports, Yes)`` is not misread as ``unknown``. Empty segments (for
    example after a trailing ``|``) are skipped.

    Args:
        cot: Raw string produced by the topic-extraction model.

    Returns:
        Dict mapping each topic to ``'positive'``, ``'negative'`` or
        ``'unknown'``, in the order the topics appear. May be empty.

    Raises:
        ValueError: If a non-empty segment does not contain exactly one comma.
    """
    label_to_pref = {'yes': 'positive', 'no': 'negative'}
    top_dict = {}
    for segment in cot.split('|'):
        segment = segment.replace('(', '').replace(')', '').strip()
        if not segment:
            continue
        # Unpacking deliberately fails on segments with zero or several commas.
        the_top, pref = segment.split(',')
        top_dict[the_top.strip()] = label_to_pref.get(pref.strip().lower(), 'unknown')
    return top_dict

def update_graph(top_pref_prof, g):
    """Merge one utterance's topic preferences into the graph ``g`` in place.

    Every topic becomes a node whose ``pref`` attribute holds its latest
    preference, and every pair of topics from the same profile is connected
    by an edge.

    Edges use exactly the same keys as the nodes. Previously the edge
    endpoints were ``topic.split(',')[0]``, which for a topic containing a
    comma would create extra nodes without a ``pref`` attribute.

    Args:
        top_pref_prof: Dict mapping topic to preference, as returned by
            ``CoT_to_Preference``.
        g: Graph to update. It must support ``nodes``, ``edges``,
            ``add_node`` and ``add_edge`` the way ``networkx.Graph`` does.

    Returns:
        None. ``g`` is modified in place.
    """
    earlier_topics = []
    for topic, pref in top_pref_prof.items():
        # Add the node if it is new, otherwise refresh its stored preference.
        if topic not in g.nodes:
            g.add_node(topic, pref=pref)
        else:
            g.nodes[topic]['pref'] = pref

        # Link this topic to every topic seen earlier in the same profile.
        for earlier in earlier_topics:
            if (earlier, topic) not in g.edges:
                g.add_edge(earlier, topic)
        earlier_topics.append(topic)

## Generate Extracted Topic File

In [6]:
# Build ./eval_ds.json. For each of the first MAX_CONVERSATIONS training
# conversations, every agent_1 message is paired with the agent_2 message that
# follows it, together with the focus topic extracted from the agent_1 message
# and a base64-encoded pickle of the topic graph accumulated so far.
MAX_CONVERSATIONS = 50

with open('./topical_chat/Topical-Chat-master/conversations/train.json', 'r') as fp:
    tcds = json.load(fp)

save_js = {}

for i, t in enumerate(tcds):
    if i == MAX_CONVERSATIONS:
        break

    graph = nx.Graph()
    # agent_1 is treated as the user and agent_2 as the reference responder.
    # NOTE(review): assumption carried over from the original notebook; confirm.
    conv_list = []
    utterance = None
    ground_truth = None
    focus_topic = None
    for msg in tqdm(tcds[t]['content']):
        is_issue = False
        if msg['agent'] == 'agent_1':
            utterance = msg['message']
            # A new agent_1 turn starts a new pair. Drop any reply left over
            # from an earlier turn (e.g. the reply to an utterance whose
            # extraction failed), otherwise this utterance would be paired with
            # a reply that precedes it and every later pair would be shifted.
            ground_truth = None
            try:
                topic_xtract = generate_cot(utterance, cot_tokenizer, cot_model)
                topic_pref_profile = CoT_to_Preference(topic_xtract.strip())
                update_graph(topic_pref_profile, graph)
                focus_topic = list(topic_pref_profile.keys())[-1]
            except Exception:
                # Best effort: unparsable model output (or an empty profile)
                # marks this turn as failed instead of aborting the run.
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                is_issue = True
        else:
            ground_truth = msg['message']

        if is_issue:
            # Keep a placeholder row so failed turns stay visible in the output.
            conv_list.append({'utterance': 'Nothing', 'ground_truth': 'Nothing',
                              'focus_topic': 'Nothing', 'graph': 'Nothing'})
            utterance = None
            ground_truth = None
            continue

        if utterance is not None and ground_truth is not None:
            # Serialize the current graph as text so it can live inside the JSON file.
            pickled = codecs.encode(pickle.dumps(graph), "base64").decode()
            conv_list.append({'utterance': utterance, 'ground_truth': ground_truth,
                              'focus_topic': focus_topic, 'graph': pickled})
            utterance = None
            ground_truth = None

    save_js[t] = conv_list

with open('./eval_ds.json', 'w') as fp:
    json.dump(save_js, fp)
# The file maps conversation_id -> list of {utterance, ground_truth, focus_topic, graph}.
# pickled = codecs.encode(pickle.dumps(graph), "base64").decode()
# unpickled = pickle.loads(codecs.decode(pickled.encode(), "base64"))


100%|██████████| 21/21 [00:08<00:00,  2.57it/s]
100%|██████████| 21/21 [00:08<00:00,  2.60it/s]
100%|██████████| 21/21 [00:08<00:00,  2.39it/s]
100%|██████████| 23/23 [00:08<00:00,  2.60it/s]
100%|██████████| 21/21 [00:08<00:00,  2.62it/s]
100%|██████████| 21/21 [00:08<00:00,  2.55it/s]
100%|██████████| 21/21 [00:08<00:00,  2.47it/s]
100%|██████████| 21/21 [00:08<00:00,  2.42it/s]
100%|██████████| 21/21 [00:09<00:00,  2.29it/s]
100%|██████████| 35/35 [00:13<00:00,  2.53it/s]
100%|██████████| 21/21 [00:10<00:00,  2.03it/s]
100%|██████████| 21/21 [00:08<00:00,  2.59it/s]
100%|██████████| 21/21 [00:07<00:00,  2.75it/s]
100%|██████████| 24/24 [00:09<00:00,  2.47it/s]
100%|██████████| 21/21 [00:08<00:00,  2.51it/s]
100%|██████████| 21/21 [00:08<00:00,  2.59it/s]
100%|██████████| 21/21 [00:07<00:00,  2.64it/s]
100%|██████████| 21/21 [00:07<00:00,  2.82it/s]
100%|██████████| 21/21 [00:08<00:00,  2.57it/s]
100%|██████████| 21/21 [00:07<00:00,  2.70it/s]
100%|██████████| 23/23 [00:09<00:00,  2.

## BLEU Score

In [8]:
# Sanity check: a hypothesis identical to its single reference scores 1.0.
ref = 'the dog jumped over the moon'.split()
hyp = list(ref)

BLEUscore = nltk.translate.bleu_score.sentence_bleu(references=[ref], hypothesis=hyp)
print(BLEUscore)

1.0


## ROUGE Score
https://medium.com/mlearning-ai/text-summarization-84ada711c49c

Rouge1 = unigram overlap

Rouge2 = bigram overlap

RougeL = Longest Common Subsequence (LCS)

(RougeW ?)

In [25]:
# Sanity check of the ROUGE metric on one prediction with one reference.
rouge = evaluate.load('rouge')

# One list of acceptable references per prediction.
references = [["Summarization is very cool"]]
candidates = ["Summarization is cool"]

results = rouge.compute(references=references, predictions=candidates)
print(results)

{'rouge1': 0.8571428571428571, 'rouge2': 0.4, 'rougeL': 0.8571428571428571, 'rougeLsum': 0.8571428571428571}


## UniEval

In [2]:
# Load model directly
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# tokenizer = AutoTokenizer.from_pretrained("MingZhong/unieval-dialog")
# model = AutoModelForSeq2SeqLM.from_pretrained("MingZhong/unieval-dialog")

In [12]:
# Smoke test of the UniEval dialogue evaluator on a single hand-written sample.
# The response below ('i like pizza.') is unrelated to the dialogue history;
# the commented-out alternatives on the same lines are an on-topic example.
from UniEval.UniEval.utils import convert_to_json
from UniEval.UniEval.metric.evaluator import get_evaluator

task = 'dialogue'

# a list of dialogue histories
src_list = ['hi , do you know much about the internet ? \n i know a lot about different sites and some website design , how about you ? \n\n']
# a list of additional context that should be included into the generated response
context_list = ['']#['the 3 horizontal line menu on apps and websites is called a hamburger button .\n']
# a list of model outputs to be evaluated
output_list = ['i like pizza.']# ['i do too . did you know the 3 horizontal line menu on apps and websites is called the hamburger button ?']

# Prepare data for pre-trained evaluators
data = convert_to_json(output_list=output_list, 
                       src_list=src_list, context_list=context_list)

# Initialize evaluator for a specific task
evaluator = get_evaluator(task)
# Get multi-dimensional evaluation scores
# (print_result=True also prints the per-dimension score table shown below the cell)
eval_scores = evaluator.evaluate(data, print_result=True)

Evaluating naturalness of 1 samples !!!


100%|██████████| 1/1 [00:00<00:00, 21.62it/s]


Evaluating coherence of 1 samples !!!


100%|██████████| 1/1 [00:00<00:00, 22.14it/s]


Evaluating engagingness of 1 samples !!!


100%|██████████| 1/1 [00:00<00:00, 21.89it/s]


Evaluating groundedness of 1 samples !!!


100%|██████████| 1/1 [00:00<00:00, 22.40it/s]


Evaluating understandability of 1 samples !!!


100%|██████████| 1/1 [00:00<00:00, 22.27it/s]


Evaluation scores are shown below:
+-------------------+----------+
|     Dimensions    |  Score   |
+-------------------+----------+
|    naturalness    | 0.510111 |
|     coherence     | 0.000146 |
|    engagingness   | 0.00046  |
|    groundedness   | 0.430141 |
| understandability | 0.489637 |
|      overall      | 0.286099 |
+-------------------+----------+





## Run evaluation

In [None]:
import openai

# The module-level client used later (openai.ChatCompletion.create) takes its
# key from `openai.api_key`. The previous `openai.my_api_key` only created an
# unused attribute, so the key was never configured.
with open('./api_key.txt', 'r') as key_file:
    # strip(): drop the trailing newline most editors add to the key file.
    openai.api_key = key_file.read().strip()

In [14]:
def generate_recommendation(text_in, tok_in, mod_in):
    """Run the recommender model on a prompt and return its decoded output.

    Args:
        text_in: Prompt text to feed to the model.
        tok_in: Tokenizer. It is called as ``tok_in(text_in, return_tensors='pt')``
            and must also provide ``decode``.
        mod_in: Model exposing ``generate``. If it has a ``device`` attribute the
            inputs are moved there; otherwise ``'cuda:0'`` is used, as before.

    Returns:
        The first generated sequence (at most 32 new tokens) decoded with
        special tokens removed.
    """
    # Follow the model's device instead of assuming the first GPU.
    device = getattr(mod_in, 'device', 'cuda:0')
    tok_text = tok_in(text_in, return_tensors='pt').to(device)
    gen_text = mod_in.generate(**tok_text, max_new_tokens=32)
    return tok_in.decode(gen_text[0], skip_special_tokens=True)

def generate_response(text_in, guideline, tok_in, mod_in):
    """Generate a reply to ``text_in`` that is conditioned on ``guideline``.

    The model input is ``text_in + ' [GUIDELINE] ' + guideline``, truncated to
    512 tokens by the tokenizer call.

    Args:
        text_in: The user's utterance.
        guideline: Guideline text appended after the ``[GUIDELINE]`` marker.
        tok_in: Tokenizer. It is called on a one-element list and must also
            provide ``batch_decode``.
        mod_in: Model exposing ``generate``. If it has a ``device`` attribute
            the encoded inputs are moved there before generation.

    Returns:
        The first generated sequence decoded with special tokens removed.
    """
    blend_in_str = text_in + ' [GUIDELINE] ' + guideline
    blend_in_ids = tok_in([blend_in_str], max_length=512, return_tensors='pt', truncation=True)
    # The inputs were previously left where the tokenizer created them, which
    # breaks as soon as the model has been moved to another device.
    device = getattr(mod_in, 'device', None)
    if device is not None:
        blend_in_ids = blend_in_ids.to(device)
    blend_example = mod_in.generate(**blend_in_ids, max_length=60)
    return tok_in.batch_decode(blend_example, skip_special_tokens=True)[0]

In [8]:
# Response generator: tokenizer from the public BlenderBot checkpoint, model
# weights from the "TrevorAshby/blenderbot-400M-distill" repository.
blen_tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
blen_model = AutoModelForSeq2SeqLM.from_pretrained("TrevorAshby/blenderbot-400M-distill")
blen_model = blen_model.cuda()

# Topic recommender: t5-large whose weights are replaced by the local checkpoint.
recommender_tokenizer = AutoTokenizer.from_pretrained("t5-large")
recommender_model = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
# map_location='cpu': deserialize the checkpoint onto the CPU no matter which
# device it was saved from; the model is moved to the GPU on the next line.
recommender_model.load_state_dict(
    torch.load('../CoT/recommender/model/rec_er.pt', map_location='cpu')
)
recommender_model = recommender_model.cuda()

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [30]:
# Evaluate guided response generation on the first 10 conversations of
# ./eval_ds.json. For every stored turn this:
#   1. rebuilds the user's preference profile around the focus topic,
#   2. asks the recommender model for new topics,
#   3. generates a BlenderBot reply conditioned on a templated guideline,
#   4. scores the reply with BLEU and ROUGE against the reference reply and
#      asks ChatGPT to rate it.
# Results are written to ./eval_final.json.
rouge = evaluate.load('rouge')

with open('./eval_ds.json', 'r') as fp:
    eval_tcds = json.load(fp)

save_js = {}
num_sugg = 3

for i, t in enumerate(eval_tcds):
    if i == 10:
        break
    conv_list = []
    for inst in tqdm(eval_tcds[t]):
        user_in = inst['utterance']
        real_response = inst['ground_truth']
        pickled = inst['graph']
        focus_topic = inst['focus_topic']

        # Turns whose topic extraction failed are stored with the literal
        # placeholder 'Nothing' in every field. There is no graph to decode
        # for them (base64-decoding the placeholder would raise), so skip them.
        if pickled == 'Nothing':
            continue

        # SECURITY: pickle.loads can execute arbitrary code. Only feed it
        # eval_ds.json files produced by this notebook, never untrusted data.
        unpickled = pickle.loads(codecs.decode(pickled.encode(), "base64"))

        # Profile = the focus topic plus every topic directly linked to it.
        xtract_prof = {focus_topic: unpickled.nodes[focus_topic]['pref']}
        for x_nodes in unpickled.edges([focus_topic]):
            xn = x_nodes[1]
            # Tolerate neighbours stored without a preference attribute.
            xtract_prof[xn] = unpickled.nodes[xn].get('pref', 'unknown')

        rec_prompt = f"Instruction: Generate only {num_sugg} similar topics that could be suggested for new conversation that takes influence from but are not present in the following user profile: {xtract_prof} In the generated answer, generate each of the suggested topics separated by a comma like so: TOPIC1,TOPIC2,TOPIC3,TOPIC4,etc.\nSuggested Topics:"
        topic_recs = generate_recommendation(rec_prompt, recommender_tokenizer, recommender_model).split(',')

        # template guideline generation
        if xtract_prof[focus_topic] == 'positive':
            tpref = 'The user likes'
        elif xtract_prof[focus_topic] == 'negative':
            tpref = 'The user dislikes'
        else:
            tpref = 'It is unclear if the user likes or dislikes'

        guideline = f'{tpref} {focus_topic}. Direct the conversation to one of the following 3 topics: {topic_recs}.'

        # response generation (inputs follow the model's device)
        blend_in_ids = blen_tokenizer([f'{user_in} [GUIDELINE] {guideline}'], max_length=128, return_tensors='pt', truncation=True).to(blen_model.device)
        blend_example = blen_model.generate(**blend_in_ids, max_length=60)
        blend_response = blen_tokenizer.batch_decode(blend_example, skip_special_tokens=True)[0]

        # BLEU: sentence_bleu expects a list of tokenized references followed
        # by the tokenized hypothesis. Passing raw strings scored character
        # n-grams, and the reference and the hypothesis were swapped.
        our_bleu = nltk.translate.bleu_score.sentence_bleu(
            [real_response.split()], blend_response.split()
        )

        # ROUGE
        our_rouge = rouge.compute(predictions=[blend_response], references=[[real_response]])

        # ChatGPT rating. `reply` stays None when there is nothing to rate, so
        # a result from a previous turn is never reused by accident.
        messages = [ {"role": "system", "content": "You are a intelligent assistant."} ]
        reply = None
        if blend_response:
            judge_prompt = (
                'You job is to rank on a scale of 1-5 how well utterance B responds to utterance A:\n'
                f'A: "{user_in}"\n'
                f'B: "{blend_response}"'
            )
            messages.append({"role": "user", "content": judge_prompt})
            chat = openai.ChatCompletion.create(
                model="gpt-3.5-turbo", messages=messages
            )
            reply = chat.choices[0].message.content
            print(f"ChatGPT: {reply}")

        ours = {'generated':blend_response,
                'bleu':our_bleu,
                'rouge':our_rouge,
                'chatgpt_reply':reply,
                'guideline':guideline,
                'suggested_topics':topic_recs,
                'focus_topic':focus_topic,
                'pref_prof':xtract_prof}
        conv_list.append({'user_in':user_in, 'ours':ours, 'target_response':real_response})
        # save [input, our output, bleu, rouge, output baselines, bleu, rouge, GPT-4 ranking]
    save_js[t] = conv_list

with open('./eval_final.json', 'w') as fp:
    json.dump(save_js, fp)

100%|██████████| 10/10 [00:30<00:00,  3.09s/it]
100%|██████████| 10/10 [00:32<00:00,  3.30s/it]
100%|██████████| 10/10 [00:32<00:00,  3.24s/it]
100%|██████████| 11/11 [00:35<00:00,  3.22s/it]
100%|██████████| 10/10 [00:31<00:00,  3.18s/it]
100%|██████████| 10/10 [00:31<00:00,  3.13s/it]
100%|██████████| 10/10 [00:32<00:00,  3.23s/it]
100%|██████████| 10/10 [00:30<00:00,  3.03s/it]
100%|██████████| 10/10 [00:32<00:00,  3.30s/it]
100%|██████████| 17/17 [00:54<00:00,  3.21s/it]
