# Setup

In [1]:
# When True, the cache helpers print a message each time an entry is loaded or flushed.
verbose = True
# When True, the metric-training cell trains the chatbot classifiers; when False it is skipped.
do_metric_training = False

In [2]:
### Run environment setup
import os
import lib.BBSetup as BBSetup

# Try the available setup strategies in order: Google Colab first, then a
# manually configured Anaconda environment, then the automatic Anaconda setup.
# `except Exception` (instead of a bare `except`) keeps this best-effort
# fallback chain but lets KeyboardInterrupt / SystemExit propagate, so the
# cell can still be interrupted while a setup step is running.
try:
    from google.colab import drive
    BBSetup.colab_setup(mount_folder=r"/content/drive/My Drive/unibo/NLP_project/BarneyBot")
except Exception:
    try:
        BBSetup.anaconda_manual_setup(base_folder=r"E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot",
                                      env_name="barneybot")
    except Exception:
        BBSetup.anaconda_auto_setup(base_folder=r"E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot")

### Define folders
# NOTE(review): BBSetup.BASE_FOLDER is presumably set by whichever setup call
# succeeded above -- confirm against lib.BBSetup.
base_folder = BBSetup.BASE_FOLDER
in_folder = BBSetup.set_folder(os.path.join(base_folder, 'Data', 'Characters'))
out_folder = BBSetup.set_folder(os.path.join(base_folder, 'Metrics', 'New'))

pip install -r "E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\requirements.txt"


In [3]:
### load_char_df() (hg dataset) ['test'] to get testset, containing contexts and response
### get_chatbot_predictions() to get a type of predictions for a model
from lib.BBDataLoad import load_char_df, get_chatbot_predictions, dialogpt_preprocess_function
from datasets import load_dataset
from transformers import TFAutoModelForCausalLM
from lib.BBMetrics import BBMetric
from lib.BBMetricResults import *

from lib.BBData import character_dict, model_name, random_state
# Names of the characters handled by this notebook: every key of
# character_dict except the 'Default' entry.
characters = list(character_dict.keys())
characters.remove('Default')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Valerio\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Import structures from HuggingFace
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling

# Tokenizer for the pretrained model; downloaded files are kept under <base_folder>/cache.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=os.path.join(base_folder, "cache"),
)
# Use '#' as the padding token.
tokenizer.pad_token = '#'
# Collator for causal language modeling (mlm=False) that returns TensorFlow tensors.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    return_tensors='tf',
)

In [5]:
# Optional training of the classifier-based metrics, controlled by do_metric_training.
if not do_metric_training:
    print("Skipping Metrics training.")
else:
    print("Training metrics")
    # Neural Chatbot Classifier: one classifier is trained per character.
    # NOTE(review): `tqdm` is not imported in any visible cell; presumably it
    # comes in through `from lib.BBMetricResults import *` -- confirm.
    for char in tqdm(characters):
        neural_classifier = BBMetric.load_metric("neural chatbot classifier")
        neural_classifier.train(
            character=char,
            random_state=random_state,
            source_encoded_path=None,
            source_path=os.path.join(
                base_folder, "Data", "Sources",
                character_dict[char]['source'],
                character_dict[char]['source'] + ".csv"),
            source_save_path=os.path.join(base_folder, "Data", "Characters", char),
            save_path=os.path.join(base_folder, "Data", "Characters", char),
        )
    # Distilbert-Embedded Chatbot Classifier: loaded, but training is not implemented yet.
    bertembedded_classifier = BBMetric.load_metric("distilbert-embedded chatbot classifier")
    raise NotImplementedError("Will be soon!")

Skipping Metrics training.


# Cache System Creation

In [6]:
# Define a cache structure to avoid reloading stuff
from types import SimpleNamespace

# Every slot starts as None (= "not loaded yet") and is filled lazily by the
# get_cache_* helpers. Dataset keys carry a "_df" suffix, chatbot keys do not.
cache = SimpleNamespace(
    # Chatbot models, one per character plus the non-finetuned "Base" model
    dialogpt={chatbot: None for chatbot in characters + ["Base"]},
    tokenizer=None,
    datacollator=None,
    # Metrics that need training/loading before use
    trained_metric={
        'neural chatbot classifier': {classifier_char: None for classifier_char in characters},
        'frequency chatbot classifier': None,
        'distilbert-embedded chatbot classifier': None
    },
    # Test sets, one per character plus the "Common" dataset
    testset={dataset + "_df": None for dataset in characters + ["Common"]},
    concat_and_encoded_testset={dataset + "_df": None for dataset in characters + ["Common"]},
    # predictions[<dataset>_df][<chatbot>][<generation type>]
    predictions={
        dataset + "_df": {
            chatbot: {
                'greedy': None,
                'nbeams': None,
                'sampling': None
            } for chatbot in characters + ["Base"]
        } for dataset in characters + ["Common"]
    },
)

def _cache_mapping(node, entry):
    """Return the mutable mapping behind a cache node.

    A dict is returned as-is; a SimpleNamespace is exposed through its
    attribute dictionary, so both kinds of node can be indexed the same way.

    Raises:
        TypeError: if `node` is neither a dict nor a SimpleNamespace.
    """
    if isinstance(node, dict):
        return node
    if isinstance(node, SimpleNamespace):
        return vars(node)
    raise TypeError("Cannot index into cache node of type " + type(node).__name__ +
                    " while resolving " + str(entry))


def _resolve_cache_container(entry):
    """Follow entry[:-1] from the root `cache` and return the mapping that holds entry[-1]."""
    if len(entry) == 0:
        raise ValueError("A cache entry path needs at least one key")
    # The root itself may be the container (e.g. entry == ['tokenizer']), so it
    # is converted with the same rule as every intermediate node.
    mapping = _cache_mapping(cache, entry)
    for key in entry[:-1]:
        mapping = _cache_mapping(mapping[key], entry)
    return mapping


def load_cache_entry(value, entry):
    """Store `value` at the cache path `entry` unless that slot is already filled.

    Args:
        value: object to cache.
        entry: sequence of keys leading from the root `cache` to the slot,
            e.g. ['predictions', 'Common_df', 'Base', 'greedy'] or ['tokenizer'].

    Returns:
        The object held by the slot after the call: the previously cached one
        if the slot was filled, otherwise `value`.

    Raises:
        KeyError: if a key of `entry` does not exist in the cache.
        TypeError: if the path runs through something that is not a dict or
            SimpleNamespace.
        ValueError: if `entry` is empty.
    """
    container = _resolve_cache_container(entry)
    leaf = entry[-1]
    # None is the "not loaded" sentinel. A truthiness test would also treat
    # legitimately empty values (e.g. an empty prediction list) as missing.
    if container[leaf] is None:
        container[leaf] = value
        if verbose:
            print("Loaded cache at " + str(entry))
    return container[leaf]


def flush_cache_entries(entries):
    """Reset each cache slot listed in `entries` back to None.

    Args:
        entries: iterable of cache paths, each in the format accepted by
            load_cache_entry. Flushing a path that points at a whole sub-tree
            replaces that sub-tree with None.
    """
    for entry in entries:
        _resolve_cache_container(entry)[entry[-1]] = None
        if verbose:
            print("Flushed cache at " + str(entry))

In [7]:
def get_cache_testset(character, base_folder):
    """Return the test set for `character`, loading and caching it on first use.

    Args:
        character: a character name, or "Common" for the shared dataset stored
            in <base_folder>/Data/common_dataset.csv.
        base_folder: project root folder.

    Returns:
        The cached dataset: the 'test' split returned by load_char_df for a
        character, or the 'train' split of the CSV dataset for "Common".
    """
    slot = character + "_df"
    # Compare against the None sentinel explicitly: a truthiness test would
    # treat an empty (zero-length) dataset as "not loaded" and reload it on
    # every call.
    if cache.testset[slot] is None:
        if character != "Common":
            df = load_char_df(character, base_folder)['test']
        else:
            df = load_dataset('csv',
                     data_files=os.path.join(base_folder, 'Data', 'common_dataset.csv'),
                     cache_dir=os.path.join(base_folder, "cache"))['train']
        load_cache_entry(df, ['testset', slot])
    return cache.testset[slot]

# For perplexity
def get_cache_concat_and_encoded_testset(character, base_folder):
    """Return the encoded, batched test set for `character` (used for perplexity).

    On a cache miss the test set is passed row by row through
    dialogpt_preprocess_function with the cached tokenizer, converted with
    `to_tf_dataset` (batch size 8, no shuffling, cached data collator) and
    stored in the cache.
    """
    slot = character + "_df"
    if cache.concat_and_encoded_testset[slot]:
        return cache.concat_and_encoded_testset[slot]

    raw_testset = get_cache_testset(character, base_folder)
    encoded_rows = raw_testset.map(
        lambda row: dialogpt_preprocess_function(row, cache.tokenizer),
        batched=False)
    batched_dataset = encoded_rows.to_tf_dataset(
        columns=["input_ids", "attention_mask", "labels"],
        shuffle=False,
        batch_size=8,
        collate_fn=cache.datacollator,
    )
    load_cache_entry(batched_dataset, ['concat_and_encoded_testset', slot])
    return cache.concat_and_encoded_testset[slot]

def get_cache_predictions(dataset_from, character, base_folder, gen_type):
    """Return decoded predictions of a chatbot on a dataset, caching them on first use.

    Args:
        dataset_from: dataset key with "_df" suffix (e.g. "Barney_df", "Common_df").
        character: chatbot name, or "Base" for the non-finetuned model.
        base_folder: project root folder.
        gen_type: generation type; selects the cache slot ('greedy', 'nbeams'
            or 'sampling') and is forwarded to get_chatbot_predictions.

    Returns:
        List of decoded prediction strings.

    Raises:
        NotImplementedError: if the (dataset, chatbot) combination is none of
            the supported ones.
    """
    # Explicit None check: an empty prediction list is a valid cached result
    # and must not trigger another (potentially expensive) generation.
    if cache.predictions[dataset_from][character][gen_type] is None:
        if dataset_from == character + "_df":
            # Chatbot on its own test set.
            # NOTE(review): with model/tokenizer set to None this presumably
            # reads stored predictions from the named JSON file -- confirm
            # against lib.BBDataLoad.get_chatbot_predictions.
            predictions_tk = get_chatbot_predictions(None, None,
                  character_dict[character]['prediction_filename'] + '_' + gen_type + '.json',
                  None, character, None, base_folder, override_predictions=False)
        elif character == "Base":
            # Base model on some dataset
            predictions_tk = get_chatbot_predictions(None, None,
                  'from_' + dataset_from + '__' + gen_type + '.json',
                  None, 'Default', None, base_folder, override_predictions=False)
        elif dataset_from == "Common_df":
            # Finetuned chatbot on the common dataset ("Base" was handled by
            # the previous branch): generated on the fly, no file caching.
            df = load_dataset('csv',
                         data_files=os.path.join(base_folder, 'Data', 'common_dataset.csv'),
                         cache_dir=os.path.join(base_folder, "cache"))
            df = df.remove_columns(['source'])
            model = get_cache_model(character)
            predictions_tk = get_chatbot_predictions(df['train']['context/0'], model,
                  "", gen_type, character, cache.tokenizer, base_folder, file_caching=False, override_predictions=False)
        else:
            raise NotImplementedError("Unexpected predictions to load!")
        # Turn token-id sequences into plain strings
        predictions = [cache.tokenizer.decode(line, skip_special_tokens=True)
                       for line in predictions_tk]
        load_cache_entry(predictions, ['predictions', dataset_from, character, gen_type])
    return cache.predictions[dataset_from][character][gen_type]

# For metrics worth caching, in particular the chatbot classifiers
def get_cache_metric(metric_name, **kwargs):
    """Return the metric called `metric_name`, reusing cached instances where supported.

    Metrics without a slot in cache.trained_metric are loaded fresh on every
    call. The chatbot classifiers are prepared once and then reused.

    Keyword Args:
        classifier_char: character whose neural classifier is requested
            (used by "neural chatbot classifier" only).
        mode: training mode for the "frequency chatbot classifier";
            defaults to 'c-tf-idf'.

    Raises:
        NotImplementedError: for "distilbert-embedded chatbot classifier".
    """
    classifier_char = kwargs.get('classifier_char')
    mode = kwargs.get('mode', 'c-tf-idf')
    trained = cache.trained_metric

    # Metrics that are not worth caching
    if metric_name not in trained:
        return BBMetric.load_metric(metric_name)

    if metric_name == "neural chatbot classifier":
        per_character = trained[metric_name]
        if not per_character[classifier_char]:
            per_character[classifier_char] = BBMetric.load_metric(metric_name)
            per_character[classifier_char].compute( # Dummy round for caching
                character=classifier_char,
                load_path=os.path.join(base_folder, "Data", "Characters",
                          classifier_char, character_dict[classifier_char]['classifier_folder']),
                sentences=["Hi", "Hello", "How"])
        return per_character[classifier_char]

    if metric_name == "frequency chatbot classifier":
        if not trained[metric_name]:
            trained[metric_name] = BBMetric.load_metric(metric_name)
            trained[metric_name].train(
                characters_path=os.path.join(base_folder, "Data", "Characters"),
                mode=mode)
        return trained[metric_name]

    if metric_name == "distilbert-embedded chatbot classifier":
        raise NotImplementedError("Will be soon!")

def get_cache_model(character):
    """Return the compiled chatbot model for `character`, loading it only once.

    Args:
        character: a character name (model loaded from its checkpoint folder
            under `in_folder`), or "Base" for the pretrained `model_name`.

    Returns:
        The compiled model, also stored in cache.dialogpt[character].
    """
    # Reuse an already loaded model. Previously the cache slot was written but
    # never read, so every call reloaded the model from disk.
    cached_model = cache.dialogpt.get(character)
    if cached_model is not None:
        return cached_model

    if character == "Base":
        model = TFAutoModelForCausalLM.from_pretrained(model_name, cache_dir=os.path.join(base_folder, "cache"))
    else:
        checkpoint_folder = os.path.join(in_folder, character, character_dict[character]['checkpoint_folder'])
        model = TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint_folder)
    model.compile()
    cache.dialogpt[character] = model
    return cache.dialogpt[character]

# Register the tokenizer and data collator in the cache, where the
# get_cache_* helpers read them as cache.tokenizer / cache.datacollator.
cache.tokenizer = tokenizer
cache.datacollator = data_collator

# Evaluation Process Definition

In [8]:
def sentence_callable(reference_set, character, column):
    """Fetch the sentences a metric actor contributes.

    Args:
        reference_set: dataset key with "_df" suffix.
        character: dataset character (for dataset columns) or chatbot name
            (for prediction columns).
        column: "context/0" or "response" to read a test-set column; any other
            value is treated as a generation type and resolved through the
            predictions cache.
    """
    own_dataset = character + "_df"
    if column in ("context/0", "response"):
        # Dataset columns can only be read from the character's own dataset
        assert reference_set == own_dataset
        return get_cache_testset(character, base_folder)[column]

    allowed = (
        reference_set == own_dataset
        or reference_set == "Common_df"
        or (character == "Base" and column == "sampling")
    )
    assert allowed
    return get_cache_predictions(reference_set, character, base_folder, column)

def perplexity_callable(reference_set, character):
    """Build the keyword arguments for the perplexity metric.

    Returns a dict with the chatbot model of `character` under 'model' and the
    encoded test set of `reference_set` (with "_df" removed from the name)
    under 'encoded_test_set'.
    """
    model = get_cache_model(character)
    dataset_character = reference_set.replace("_df", "")
    encoded_test_set = get_cache_concat_and_encoded_testset(dataset_character, base_folder)
    return {'model': model, 'encoded_test_set': encoded_test_set}

In [9]:
def _collect_sentence_args(query, args_map, actor_to_column_map):
    """Fetch the sentences of every actor in `query`, keyed by the metric keyword each role feeds."""
    return {
        args_map[actor_key]: sentence_callable(query['reference_set'],
                                               actor_pair[1],
                                               actor_to_column_map[actor_pair[0]])
        for actor_key, actor_pair in query['metric_actors'].items()
    }


def evaluate_round(queries):
    """Run every query in `queries` and collect the metric results.

    A query is a dict of one of two kinds:
      * {'run': callable, 'run_args': dict} -- the callable is invoked with
        `run_args` as keyword arguments ('run_args' may be omitted); nothing
        is recorded in the results.
      * a metric query with 'metric_name', 'metric_actors' (role ->
        (MetricActor, name)), 'reference_set', and optionally 'metric_params'
        and 'metric_attempt' (defaults to 0).

    The dicts in `queries` are not modified, so the same list can be evaluated
    repeatedly and yields the same query hashes each time.

    Returns:
        dict mapping the hash of each metric query to its output record
        (metadata plus the metric result under 'answer').

    Raises:
        NotImplementedError: if a query names a metric this function does not
            know how to feed.
    """
    actors_pprint_map = {
        MetricActor.DATASET_CHAR: "dataset",
        MetricActor.DATASET_CHARCONTEXT: "dataset labels",
        MetricActor.DIALOGPT_GREEDY: "dialogpt (greedy)",
        MetricActor.DIALOGPT_NBEAMS: "dialogpt (nbeams)",
        MetricActor.DIALOGPT_SAMPLE: "dialogpt (sampling)"
    }
    actor_to_column_map = {
        MetricActor.DATASET_CHARCONTEXT: 'context/0',
        MetricActor.DATASET_CHAR: 'response',
        MetricActor.DIALOGPT_GREEDY: 'greedy',
        MetricActor.DIALOGPT_NBEAMS: 'nbeams',
        MetricActor.DIALOGPT_SAMPLE: 'sampling'
    }
    # Metrics that only need sentences, with the keyword each actor role feeds
    sentence_metrics = ['google bleu', 'meteor', 'rouge l', 'mpnet embedding similarity',
                        'emotion classifier', 'distinct', 'roberta crossencoding similarity',
                        'repetitiveness', 'term error rate', 'bertscore', 'bleurt', 'bartscore',
                        'word mover distance', 't5 grammar correction edit distance',
                        'extended edit distance']
    sentence_args_map = {
        'predictor': 'predictions', 'reference': 'references', 'document': 'sentences',
        'document0': 'sentences_a', 'document1': 'sentences_b'
    }
    comet_args_map = {
        'predictor': 'predictions', 'reference': 'references', 'document': 'sources'
    }
    results = dict()
    for i, original_query in enumerate(queries):
        query = original_query.copy() # Since there are destructive operations
        print("#### Running Query " + str(i+1) + "/" + str(len(queries)) + " ####")
        if 'run' in query:
            query['run'](**query.get('run_args', {}))
        else:
            # The shallow copy above still shares the nested dicts with the
            # caller's query; copy the two that are modified below so the
            # caller's queries stay intact.
            query['metric_actors'] = dict(query['metric_actors'])
            query['metric_params'] = dict(query.get('metric_params', {}))
            metric_name = query['metric_name']
            print("Evaluating " + metric_name + \
                  " on reference set " + query['reference_set'] + " with:")
            for actor_type, actor in query['metric_actors'].items():
                print("\t" + actor[1] + " " + actors_pprint_map[actor[0]] + " as " + actor_type)
            # Get metric metadata data for outputting
            query_output = dict()
            query_output['metric_name'] = metric_name
            query_output['metric_version'] = 1 # It's 1 for all metrics we use, anyway
            query_output['metric_attempt'] = query.get('metric_attempt', 0)
            # NOTE(review): these two reference the working copies, so the
            # stored record reflects the "_df" stripping and the removal of
            # 'classifier_char' done below, while the hash is computed from the
            # values as given in the query. This matches the records produced
            # so far -- confirm it is what consumers of the results expect.
            query_output['metric_actors'] = query['metric_actors']
            query_output['metric_params'] = query['metric_params']
            query_output['context'] = {
                "dialogpt_size": "small",
                "dialogpt_context_sentences": 5,
                "dialogpt_nbeams_beams": 3,
                "dialogpt_sample_top_p": 0.92,
                "dialogpt_sample_top_k": 50
            }
            query_output['metric_arity'] = get_metric_arity(metric_name)
            query_output['metric_determinism'] = get_metric_determinism(metric_name,
                                                                        query_output['metric_version'])
            query_output['reference_set'] = query['reference_set']
            query_hash = dict_hash({'metric_name': query_output['metric_name'],
                                    'metric_version': query_output['metric_version'],
                                    'reference_set': query_output['reference_set'],
                                    'metric_attempt': query_output['metric_attempt'],
                                    'metric_actors': query_output['metric_actors'],
                                    'context': query_output['context'],
                                    'metric_params': query_output['metric_params']})
            # Lazy fix for "_df" suffix: dataset actors are named "<char>_df"
            # in the query, but the loaders expect the bare character name.
            for key, (actor_kind, actor_name) in list(query['metric_actors'].items()):
                if actor_kind in (MetricActor.DATASET_CHARCONTEXT, MetricActor.DATASET_CHAR):
                    query['metric_actors'][key] = (actor_kind, actor_name.replace("_df", ""))
            # Compute the actual metric
            if metric_name in sentence_metrics:
                metric = get_cache_metric(metric_name)
                args_dict = _collect_sentence_args(query, sentence_args_map, actor_to_column_map)
            elif metric_name == 'comet':
                metric = get_cache_metric(metric_name)
                args_dict = _collect_sentence_args(query, comet_args_map, actor_to_column_map)
            elif metric_name == 'perplexity':
                actor_pair = list(query['metric_actors'].values())[0]
                metric = get_cache_metric(metric_name)
                args_dict = perplexity_callable(query['reference_set'],
                                                actor_pair[1])
            elif metric_name in ['frequency chatbot classifier',
                                 'distilbert-embedded chatbot classifier']:
                actor_pair = list(query['metric_actors'].values())[0]
                metric = get_cache_metric(metric_name)
                args_dict = {
                    'sentences': sentence_callable(query['reference_set'],
                                                   actor_pair[1],
                                                   actor_to_column_map[actor_pair[0]])
                }
            elif metric_name == 'neural chatbot classifier':
                actor_pair = list(query['metric_actors'].values())[0]
                classifier_char = query['metric_params']['classifier_char']
                args_dict = {
                    'character': classifier_char,
                    'load_path': os.path.join(base_folder, "Data", "Characters",
                                  classifier_char, character_dict[classifier_char]['classifier_folder']),
                }
                metric = get_cache_metric(metric_name,
                                          classifier_char=classifier_char)
                # 'classifier_char' selects the classifier; it is not forwarded to compute()
                del query['metric_params']['classifier_char']
                args_dict['sentences'] = sentence_callable(query['reference_set'],
                                                           actor_pair[1],
                                                           actor_to_column_map[actor_pair[0]])
            else:
                # Without this branch an unknown name would silently reuse the
                # metric and arguments left over from the previous query.
                raise NotImplementedError("Unsupported metric: " + str(metric_name))
            query_output['answer'] = metric.compute(**{**args_dict, **query['metric_params']})
            results[query_hash] = query_output

        print()
    print("Done.")
    return results

# Run Evaluations

In [10]:
# Metric Name: See BBMetric.metrics_list
# Metric Params: See the optional and required params of each metric
## NOTE: For neural chatbot classifier, add 'classifier_char' as a parameter
# Metric Actors:
## DATASET_CHARCONTEXT: (any character | "Common") + "_df"
## DATASET_CHAR: (any character | "Common") + "_df"
## DIALOGPT_GREEDY: any character | "Base"
## DIALOGPT_NBEAMS: any character | "Base"
## DIALOGPT_SAMPLE: any character | "Base"
# Reference Set: (any character | "Common") + "_df"
# Metric Attempt: Defaults to 0, add a number to save multiple runs of the same query

In [60]:
# Queries to evaluate; each dict describes one metric run.
queries = [
    {
        'metric_name': 'bleurt',
        'metric_actors': {
            # DATASET_CHAR reads the 'response' column of the test set,
            # DATASET_CHARCONTEXT reads its 'context/0' column.
            'predictor': (MetricActor.DATASET_CHAR, 'Vader_df'),
            'reference': (MetricActor.DATASET_CHARCONTEXT, 'Vader_df'),
        },
        'reference_set': 'Vader_df',
        'metric_params': {},
        # Change this number to store several runs of the same query
        'metric_attempt': 0
    }
]

In [61]:
evaluate_round(queries)

#### Running Query 1/1 ####
Evaluating bleurt (on reference set Vader_df) with:
	Vader_df dataset as predictor
	Vader_df dataset labels as reference


Using default BLEURT-Base checkpoint for sequence maximum length 128. You can use a bigger model for better results with e.g.: evaluate.load('bleurt', 'bleurt-large-512').


INFO:tensorflow:Reading checkpoint C:\Users\Valerio\.cache\huggingface\metrics\bleurt\default\downloads\extracted\3d33e07d20dc36fda3d97eef6258f85c53f0aaa9906a4530ff316c246d91a357\bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.

Done.


{'9901c208f2f724ef76bc34e0c39bfaad': {'metric_name': 'bleurt',
  'metric_version': 1,
  'metric_attempt': 0,
  'metric_actors': {'predictor': (<MetricActor.DATASET_CHAR: 1>, 'Vader'),
   'reference': (<MetricActor.DATASET_CHARCONTEXT: 0>, 'Vader')},
  'metric_params': {},
  'context': {'dialogpt_size': 'small',
   'dialogpt_context_sentences': 5,
   'dialogpt_nbeams_beams': 3,
   'dialogpt_sample_top_p': 0.92,
   'dialogpt_sample_top_k': 50},
  'metric_arity': <MetricArity.PAIRWISE: 2>,
  'metric_determinism': <MetricDeterminism.NEURAL: 2>,
  'reference_set': 'Vader_df',
  'answer': {'score': -1.4269872084259987, 'std': 0.29314456560953456}}}