# Setup

In [1]:
# When True, the cache helpers print a message every time an entry is loaded or flushed
verbose = True
# Forwarded as use_cuda when the distilbert-embedded chatbot classifier is loaded
use_cuda = False

# When True, the chatbot-classifier metrics are trained; otherwise training is skipped
do_metric_training = False
# When True, the chatbot predictions are generated and saved to file
do_predictions = False

In [2]:
### Run environment setup
import os
import lib.BBSetup as BBSetup

# Local checkout location, shared by both Anaconda setup strategies below
local_base_folder = r"E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot"

try:
    # The import is only a probe: it succeeds exclusively inside a Colab runtime
    from google.colab import drive
except ImportError:
    # Not running on Colab: try the manual Anaconda setup first, then the automatic one.
    # Only ordinary errors trigger the fallback, so Ctrl+C still interrupts the cell.
    try:
        BBSetup.anaconda_manual_setup(base_folder=local_base_folder, env_name="barneybot")
    except Exception as manual_setup_error:
        print("Manual Anaconda setup failed (" + str(manual_setup_error) +
              "), falling back to automatic setup")
        BBSetup.anaconda_auto_setup(base_folder=local_base_folder)
else:
    # Running on Colab: errors raised here are real setup failures and must surface
    BBSetup.colab_setup(mount_folder=r"/content/drive/My Drive/unibo/NLP_project/BarneyBot")

### Define folders
base_folder = BBSetup.BASE_FOLDER
in_folder = BBSetup.set_folder(os.path.join(base_folder, 'Data', 'Characters'))
out_folder = BBSetup.set_folder(os.path.join(base_folder, 'Metrics', 'New'))

pip install -r "E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\requirements.txt"


In [3]:
### load_char_df() (hg dataset) ['test'] to get testset, containing contexts and response
### get_chatbot_predictions() to get a type of predictions for a model
from lib.BBDataLoad import load_char_df, get_chatbot_predictions, dialogpt_preprocess_function
from datasets import load_dataset
from transformers import TFAutoModelForCausalLM
from lib.BBMetrics import BBMetric
from lib.BBMetricResults import *
from tqdm import tqdm

from lib.BBData import character_dict, model_name, random_state
# All character names known to the project, without the 'Default' entry
characters = [*character_dict]
characters.remove('Default')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  "class": algorithms.Blowfish,
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tonel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Import structures from HuggingFace
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling

# Load the tokenizer for model_name, keeping downloaded files under <base_folder>/cache
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          cache_dir=os.path.join(base_folder, "cache"))
# '#' is used as the padding token
tokenizer.pad_token = '#'
# Collator configured with mlm=False and TensorFlow tensors as output format
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer, return_tensors='tf')

In [5]:
def flatten(S):
    """Flatten an arbitrarily nested list into a single flat list.

    Only `list` instances are expanded; any other element (tuples, strings,
    numbers, ...) is kept as a single item. The traversal is iterative, so
    neither the length of the list nor its nesting depth is bounded by
    Python's recursion limit.

    Args:
        S: the (possibly nested) list to flatten.

    Returns:
        A new list with the non-list elements of `S` in depth-first,
        left-to-right order.
    """
    flat = []
    # Stack of iterators: the top one is the list currently being walked
    pending = [iter(S)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend into the nested list; the outer iterator resumes afterwards
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            # Current list exhausted: go back to its parent
            pending.pop()
    return flat

In [6]:
if do_predictions:
    print("Saving predictions to file")
    generation_types = ['greedy', 'nbeams', 'sampling']
    # One step per (character, generation type) plus one base-model run per character
    with tqdm(total=len(characters) * (len(generation_types) + 1)) as pbar:
        # Chatbot of a character on their own dataset
        for char in characters:
            checkpoint_folder = os.path.join(in_folder, char,
                                             character_dict[char]['checkpoint_folder'])
            model = TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint_folder)
            model.compile()
            # Same call as the evaluation code, so predictions are made on the evaluated test set
            samples = load_char_df(char, base_folder)
            for gen_type in generation_types:
                # `tokenizer` comes from the cell above; the cache object is only created later
                get_chatbot_predictions(samples['test']['context/0'], model,
                              character_dict[char]['prediction_filename'] + '_' + gen_type + '.json',
                              gen_type, char, tokenizer, base_folder, override_predictions=True)
                pbar.update(1)
        # Base chatbot on each character's dataset: the model is the same for every
        # character, so it is loaded once outside the loop
        base_model = TFAutoModelForCausalLM.from_pretrained(model_name,
                                                            cache_dir=os.path.join(base_folder, "cache"))
        base_model.compile()
        for char in characters:
            samples = load_char_df(char, base_folder)
            # The generation type is stated explicitly to match the '__sampling' file name
            get_chatbot_predictions(samples['test']['context/0'], base_model,
                              'from_' + char + "_df__sampling.json", 'sampling',
                              "Default", tokenizer, base_folder, override_predictions=True)
            pbar.update(1)

In [7]:
if do_metric_training:
    print("Training metrics")
    characters_folder = os.path.join(base_folder, "Data", "Characters")
    # A single progress bar covers every training job: one per character plus two embedders
    with tqdm(total=len(characters) + 2) as pbar:
        # Neural Chatbot Classifier
        for char in characters:
            neural_classifier = BBMetric.load_metric("neural chatbot classifier")
            neural_classifier.train(character=char, random_state=random_state,
                     source_encoded_path=None,
                     source_path=os.path.join(base_folder, "Data", "Sources",
                                              character_dict[char]['source'],
                                              character_dict[char]['source'] + ".csv"),
                     source_save_path=os.path.join(characters_folder, char),
                     save_path=os.path.join(characters_folder, char))
            pbar.update(1)
        # Distilbert-Embedded Chatbot Classifier, trained on every character
        bertembedded_classifier = BBMetric.load_metric("distilbert-embedded chatbot classifier")
        bertembedded_classifier.train(characters_path=characters_folder,
                                      save_path=os.path.join(base_folder, "Data", "Metrics",
                                                             "distilbert_embedder"),
                                      train_embedder=True,
                                      verbose=True)
        pbar.update(1)
        # Same classifier, trained with Barney left out and saved to its own folder
        characters_no_barney = characters.copy()
        characters_no_barney.remove("Barney")
        bertembedded_classifier = BBMetric.load_metric("distilbert-embedded chatbot classifier")
        bertembedded_classifier.metric.set_characters(characters_no_barney)
        bertembedded_classifier.train(characters_path=characters_folder,
                                      save_path=os.path.join(base_folder, "Data", "Metrics",
                                                             "distilbert_embedder_nobarney"),
                                      train_embedder=True,
                                      verbose=True)
        pbar.update(1)
else:
    print("Skipping Metrics training.")

Skipping Metrics training.


# Cache System Creation

In [8]:
# Define a cache structure to avoid reloading stuff
from types import SimpleNamespace

# Every slot starts as None and is filled lazily by the get_cache_* helpers.
cache = SimpleNamespace(
    # One model slot per character chatbot, plus the base model
    dialogpt=dict.fromkeys(characters + ["Base"]),
    tokenizer=None,
    datacollator=None,
    trained_metric={
        'neural chatbot classifier': dict.fromkeys(characters),
        'frequency chatbot classifier': None,
        'distilbert-embedded chatbot classifier': {'Full': None, 'No Barney': None},
    },
    # Test sets are keyed by dataset name: "<character>_df" or "Common_df"
    testset={dataset + "_df": None for dataset in characters + ["Common"]},
    concat_and_encoded_testset={dataset + "_df": None for dataset in characters + ["Common"]},
    # predictions[dataset][chatbot][generation type]
    predictions={
        dataset + "_df": {
            chatbot: {'greedy': None, 'nbeams': None, 'sampling': None}
            for chatbot in characters + ["Base"]
        }
        for dataset in characters + ["Common"]
    },
)

def load_cache_entry(value, entry):
    """Store `value` in the cache slot addressed by `entry`, unless the slot is already filled.

    Args:
        value: the object to cache.
        entry: non-empty sequence of keys leading from the root `cache` object to
            the slot, e.g. ['testset', 'Barney_df'] or ['tokenizer'].

    Returns:
        The content of the slot after the call: `value` if the slot was empty
        (None), otherwise the object that was already cached.

    Raises:
        ValueError: if `entry` is empty.
        TypeError: if the path crosses something that is neither a dict nor a
            SimpleNamespace.
        KeyError: if a key of `entry` does not exist.
    """
    if not entry:
        raise ValueError("Cache entry path must contain at least one key")

    def as_mapping(node):
        # Namespaces are accessed through their attribute dict, so the same
        # subscript syntax works at every level, including the last one
        if isinstance(node, dict):
            return node
        if isinstance(node, SimpleNamespace):
            return vars(node)
        raise TypeError("Cannot traverse cache node of type " + type(node).__name__ +
                        " while resolving " + str(entry))

    pointer = cache
    for key in entry[:-1]:
        pointer = as_mapping(pointer)[key]
    slot_owner = as_mapping(pointer)
    # Only None marks an empty slot: falsy values such as [] or 0 are valid cached content
    if slot_owner[entry[-1]] is None:
        slot_owner[entry[-1]] = value
        if verbose:
            print("Loaded cache at " + str(entry))
    return slot_owner[entry[-1]]

def flush_cache_entries(entries):
    """Reset the given cache slots to None.

    Args:
        entries: iterable of key paths; each path is a non-empty sequence of keys
            leading from the root `cache` object to the slot to clear, e.g.
            ['predictions', 'Barney_df', 'Barney', 'greedy'] or ['tokenizer'].

    Raises:
        ValueError: if a path is empty.
        TypeError: if a path crosses something that is neither a dict nor a
            SimpleNamespace.
        KeyError: if an intermediate key of a path does not exist.
    """
    def as_mapping(node, entry):
        # Namespaces are accessed through their attribute dict, so the same
        # subscript syntax works at every level, including the last one
        if isinstance(node, dict):
            return node
        if isinstance(node, SimpleNamespace):
            return vars(node)
        raise TypeError("Cannot traverse cache node of type " + type(node).__name__ +
                        " while resolving " + str(entry))

    for entry in entries:
        if not entry:
            raise ValueError("Cache entry path must contain at least one key")
        pointer = cache
        for key in entry[:-1]:
            pointer = as_mapping(pointer, entry)[key]
        as_mapping(pointer, entry)[entry[-1]] = None
        if verbose:
            print("Flushed cache at " + str(entry))

In [20]:
def get_cache_testset(character, base_folder):
    """Return the test set of `character`, loading and caching it on first use.

    For "Common" the rows come from Data/common_dataset.csv under `base_folder`
    (its 'train' split); for any other name they are the 'test' part of
    load_char_df(character, base_folder).
    """
    slot = character + "_df"
    if cache.testset[slot]:
        return cache.testset[slot]
    if character == "Common":
        common_csv = os.path.join(base_folder, 'Data', 'common_dataset.csv')
        loaded = load_dataset('csv',
                              data_files=common_csv,
                              cache_dir=os.path.join(base_folder, "cache"))
        rows = loaded['train']
    else:
        rows = load_char_df(character, base_folder)['test']
    load_cache_entry(rows, ['testset', slot])
    return cache.testset[slot]

# For perplexity
def get_cache_concat_and_encoded_testset(character, base_folder):
    """Return the encoded test set of `character` (used for perplexity), caching it on first use.

    Each row of the cached test set goes through dialogpt_preprocess_function
    with the cached tokenizer; the result is converted with to_tf_dataset into
    unshuffled batches of 8 collated by the cached data collator.
    """
    slot = character + "_df"
    if cache.concat_and_encoded_testset[slot]:
        return cache.concat_and_encoded_testset[slot]

    def encode_row(row):
        return dialogpt_preprocess_function(row, cache.tokenizer)

    raw_testset = get_cache_testset(character, base_folder)
    encoded_rows = raw_testset.map(encode_row, batched=False)
    batched_dataset = encoded_rows.to_tf_dataset(
        columns=["input_ids", "attention_mask", "labels"],
        shuffle=False,
        batch_size=8,
        collate_fn=cache.datacollator,
    )
    load_cache_entry(batched_dataset, ['concat_and_encoded_testset', slot])
    return cache.concat_and_encoded_testset[slot]

def get_cache_predictions(dataset_from, character, base_folder, gen_type):
    """Return the decoded predictions of a chatbot on a dataset, caching them on first use.

    Args:
        dataset_from: name of the dataset the contexts come from
            ("<character>_df" or "Common_df").
        character: chatbot that produced the predictions (a character or "Base").
        base_folder: project base folder, forwarded to the loading helpers.
        gen_type: generation type, one of the keys of the predictions cache
            ('greedy', 'nbeams', 'sampling').

    Returns:
        The list of predictions decoded with the cached tokenizer, special
        tokens skipped.

    Raises:
        NotImplementedError: for a (dataset, chatbot) pair with no known source
            of predictions.
    """
    if not cache.predictions[dataset_from][character][gen_type]:
        if dataset_from == character + "_df":
            # A character chatbot on its own dataset: read the saved prediction file
            predictions_tk = get_chatbot_predictions(None, None,
                  character_dict[character]['prediction_filename'] + '_' + gen_type + '.json',
                  None, character, None, base_folder, override_predictions=False)
        elif character == "Base":
            # The base chatbot on some dataset: read the saved prediction file
            predictions_tk = get_chatbot_predictions(None, None,
                  'from_' + dataset_from + '__' + gen_type + '.json',
                  None, 'Default', None, base_folder, override_predictions=False)
        elif dataset_from == "Common_df":
            # A character chatbot on the common dataset: generate on the fly, taking the
            # contexts from the cached common test set instead of re-reading the CSV
            contexts = get_cache_testset("Common", base_folder)['context/0']
            model = get_cache_model(character)
            predictions_tk = get_chatbot_predictions(contexts, model,
                  "", gen_type, character, cache.tokenizer, base_folder,
                  file_caching=False, override_predictions=False)
        else:
            raise NotImplementedError("Unexpected predictions to load!")
        predictions = [cache.tokenizer.decode(line, skip_special_tokens=True)
                       for line in predictions_tk]
        load_cache_entry(predictions, ['predictions', dataset_from, character, gen_type])
    return cache.predictions[dataset_from][character][gen_type]

# For metrics worth caching, in particular the chatbot classifiers
def get_cache_metric(metric_name, **kwargs):
    """Return a ready-to-use metric, reusing a cached instance for the chatbot classifiers.

    Args:
        metric_name: name of the metric, as accepted by BBMetric.load_metric.
        **kwargs: metric-specific selectors:
            classifier_char: character of the "neural chatbot classifier".
            mode: training mode of the "frequency chatbot classifier".
            with_barney: variant of the "distilbert-embedded chatbot classifier",
                "Full" or "No Barney" (True / False are accepted as aliases).

    Returns:
        The cached metric for the three chatbot classifiers, a freshly loaded
        metric for any other name.

    Raises:
        ValueError: if `with_barney` does not name a known variant.
    """
    classifier_char = kwargs.get('classifier_char')
    mode = kwargs.get('mode')
    with_barney = kwargs.get('with_barney')

    if metric_name == "neural chatbot classifier" and metric_name in cache.trained_metric:
        per_character = cache.trained_metric[metric_name]
        if not per_character[classifier_char]:
            per_character[classifier_char] = BBMetric.load_metric(metric_name)
            per_character[classifier_char].compute( # Dummy round for caching
                character=classifier_char,
                load_path=os.path.join(base_folder, "Data", "Characters",
                          classifier_char, character_dict[classifier_char]['classifier_folder']),
                sentences=["Hi", "Hello", "How"])
        return per_character[classifier_char]

    if metric_name == "frequency chatbot classifier" and metric_name in cache.trained_metric:
        # NOTE(review): the cache holds a single instance, so `mode` only matters on the
        # first call - confirm that mixing modes in one session is not needed
        if not cache.trained_metric[metric_name]:
            cache.trained_metric[metric_name] = BBMetric.load_metric(metric_name)
            cache.trained_metric[metric_name].train(
                characters_path=os.path.join(base_folder, "Data", "Characters"),
                mode=mode)
        return cache.trained_metric[metric_name]

    if metric_name == "distilbert-embedded chatbot classifier" and metric_name in cache.trained_metric:
        # Each variant is loaded from the folder its training run saves to
        embedder_folders = {
            "Full": "distilbert_embedder",
            "No Barney": "distilbert_embedder_nobarney",
        }
        if isinstance(with_barney, bool):
            with_barney = "Full" if with_barney else "No Barney"
        if with_barney not in embedder_folders:
            raise ValueError("with_barney must be one of " + str(list(embedder_folders)) +
                             ", got " + repr(with_barney))
        per_variant = cache.trained_metric[metric_name]
        if not per_variant[with_barney]:
            # NOTE(review): training restricts the characters through
            # metric.set_characters(...) for the "No Barney" variant; confirm whether
            # the same call is needed after loading
            per_variant[with_barney] = BBMetric.load_metric(metric_name,
                        embedder_path=os.path.join(base_folder, "Data", "Metrics",
                                                   embedder_folders[with_barney]),
                        from_pretrained=True, use_cuda=use_cuda)
        return per_variant[with_barney]

    # Metrics that are not worth caching are simply loaded on demand
    return BBMetric.load_metric(metric_name)

def get_cache_model(character):
    """Return the compiled causal language model of a chatbot, loading it only once.

    Args:
        character: a character name, or "Base" for the model identified by
            `model_name`.

    Returns:
        The model stored in cache.dialogpt[character]. On a cache miss the model
        is loaded (from the character's checkpoint folder, or from `model_name`
        for "Base"), compiled and stored before being returned.
    """
    cached_model = cache.dialogpt.get(character)
    if cached_model is not None:
        # Already loaded in this session: avoid reading the weights from disk again
        return cached_model
    if character == "Base":
        model = TFAutoModelForCausalLM.from_pretrained(model_name, cache_dir=os.path.join(base_folder, "cache"))
    else:
        checkpoint_folder = os.path.join(in_folder, character, character_dict[character]['checkpoint_folder'])
        model = TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint_folder)
    model.compile()
    cache.dialogpt[character] = model
    return cache.dialogpt[character]

# Publish the tokenizer and the data collator through the cache, where the helpers above
# read them as cache.tokenizer and cache.datacollator
cache.tokenizer = tokenizer
cache.datacollator = data_collator

# Evaluation Process Definition

In [10]:
def sentence_callable(reference_set, character, column):
    """Fetch the sentences that an actor contributes to a metric.

    The dataset columns ("context/0", "response") are read from the cached test
    set of `character`, which must be the owner of `reference_set`. Any other
    column is treated as a generation type and read from the cached predictions
    of `character` on `reference_set`.

    Raises:
        AssertionError: if the (reference_set, character, column) combination is
            not one of the supported ones.
    """
    own_dataset = reference_set == character + "_df"
    if column in ("context/0", "response"):
        assert own_dataset
        testset = get_cache_testset(character, base_folder)
        return testset[column]
    base_sampling = character == "Base" and column == "sampling"
    assert own_dataset or reference_set == "Common_df" or base_sampling
    return get_cache_predictions(reference_set, character, base_folder, column)

def perplexity_callable(reference_set, character):
    """Build the keyword arguments used to compute perplexity.

    Returns a dict with the cached model of `character` under 'model' and the
    encoded test set of the dataset named by `reference_set` (its "_df" suffix
    removed) under 'encoded_test_set'.
    """
    dataset_character = reference_set.replace("_df", "")
    model = get_cache_model(character)
    encoded_test_set = get_cache_concat_and_encoded_testset(dataset_character, base_folder)
    return {'model': model, 'encoded_test_set': encoded_test_set}

In [21]:
def evaluate_round(queries):
    """Run a batch of metric queries and collect their results.

    Each query is a dict. A query with a 'run' key is executed as
    query['run'](**query['run_args']) and produces no result. Any other query
    describes a metric evaluation through 'metric_name', 'metric_actors',
    'reference_set', 'metric_params' and, optionally, 'metric_attempt'.

    A failing query does not stop the round: the error is printed and the next
    query is processed. The dicts passed by the caller are never modified.

    Args:
        queries: list of query dicts.

    Returns:
        A dict mapping the hash of each successful metric query to its result
        record (metric metadata plus the computed 'answer').
    """
    actors_pprint_map = {
        MetricActor.DATASET_CHAR: "dataset",
        MetricActor.DATASET_CHARCONTEXT: "dataset labels",
        MetricActor.DIALOGPT_GREEDY: "dialogpt (greedy)",
        MetricActor.DIALOGPT_NBEAMS: "dialogpt (nbeams)",
        MetricActor.DIALOGPT_SAMPLE: "dialogpt (sampling)"
    }
    actor_to_column_map = {
        MetricActor.DATASET_CHARCONTEXT: 'context/0',
        MetricActor.DATASET_CHAR: 'response',
        MetricActor.DIALOGPT_GREEDY: 'greedy',
        MetricActor.DIALOGPT_NBEAMS: 'nbeams',
        MetricActor.DIALOGPT_SAMPLE: 'sampling'
    }
    # Metrics whose compute() only receives lists of sentences, one list per actor
    sentence_metrics = ['google bleu', 'meteor', 'rouge l', 'mpnet embedding similarity',
                        'emotion classifier', 'distinct', 'roberta crossencoding similarity',
                        'repetitiveness', 'term error rate', 'bertscore', 'bleurt', 'bartscore',
                        'word mover distance', 't5 grammar correction edit distance',
                        'extended edit distance']
    # Actor role -> name of the compute() argument receiving that actor's sentences
    sentence_args_map = {
        'predictor': 'predictions', 'reference': 'references', 'document': 'sentences',
        'document0': 'sentences_a', 'document1': 'sentences_b'
    }
    comet_args_map = {
        'predictor': 'predictions', 'reference': 'references', 'document': 'sources'
    }

    def actor_sentences(reference_set, actor_pair):
        # Sentences contributed by one (actor type, actor name) pair
        return sentence_callable(reference_set, actor_pair[1], actor_to_column_map[actor_pair[0]])

    results = dict()
    for i, original_query in enumerate(queries):
        try:
            # The query is modified below (actor names stripped, parameters removed), so
            # work on a copy that also owns its nested dicts: the caller's query stays
            # intact and can be run again
            query = dict(original_query)
            for nested_key in ('metric_actors', 'metric_params'):
                if nested_key in query:
                    query[nested_key] = dict(query[nested_key])
            print("#### Running Query " + str(i+1) + "/" + str(len(queries)) + " ####")
            if 'run' in query:
                query['run'](**query['run_args'])
            else:
                metric_name = query['metric_name']
                reference_set = query['reference_set']
                print("Evaluating " + metric_name + \
                      " on reference set " + reference_set + " with:")
                for actor_type, actor in query['metric_actors'].items():
                    print("\t" + actor[1] + " " + actors_pprint_map[actor[0]] + " as " + actor_type)
                # Get metric metadata data for outputting.
                # 'metric_actors' and 'metric_params' are shared with the working copy, so
                # the stored record reflects the changes made to them further down, while
                # the hash is computed before those changes
                query_output = dict()
                query_output['metric_name'] = metric_name
                query_output['metric_version'] = 1 # It's 1 for all metrics we use, anyway
                query_output['metric_attempt'] = query.get('metric_attempt', 0)
                query_output['metric_actors'] = query['metric_actors']
                query_output['metric_params'] = query['metric_params']
                query_output['context'] = {
                    "dialogpt_size": "small",
                    "dialogpt_context_sentences": 5,
                    "dialogpt_nbeams_beams": 3,
                    "dialogpt_sample_top_p": 0.92,
                    "dialogpt_sample_top_k": 50
                }
                query_output['metric_arity'] = get_metric_arity(metric_name)
                query_output['metric_determinism'] = get_metric_determinism(metric_name,
                                                                            query_output['metric_version'])
                query_output['reference_set'] = reference_set
                query_hash = dict_hash({'metric_name': query_output['metric_name'],
                                        'metric_version': query_output['metric_version'],
                                        'reference_set': query_output['reference_set'],
                                        'metric_attempt': query_output['metric_attempt'],
                                        'metric_actors': query_output['metric_actors'],
                                        'context': query_output['context'],
                                        'metric_params': query_output['metric_params']})
                for key in query['metric_actors'].keys(): # Lazy fix for "_df" suffix
                    actor_pair = query['metric_actors'][key]
                    if actor_pair[0] == MetricActor.DATASET_CHARCONTEXT or \
                        actor_pair[0] == MetricActor.DATASET_CHAR:
                        query['metric_actors'][key] = (actor_pair[0],
                                                       actor_pair[1].replace("_df", ""))
                # Compute the actual metric
                if metric_name in sentence_metrics or metric_name == 'comet':
                    args_map = comet_args_map if metric_name == 'comet' else sentence_args_map
                    metric = get_cache_metric(metric_name)
                    args_dict = {}
                    for actor_key, actor_pair in query['metric_actors'].items():
                        args_dict[args_map[actor_key]] = actor_sentences(reference_set, actor_pair)
                elif metric_name == 'perplexity':
                    actor_pair = list(query['metric_actors'].values())[0]
                    metric = get_cache_metric(metric_name)
                    args_dict = perplexity_callable(reference_set, actor_pair[1])
                elif metric_name == 'frequency chatbot classifier':
                    actor_pair = list(query['metric_actors'].values())[0]
                    # 'mode' selects the cached classifier and is not forwarded to compute()
                    metric = get_cache_metric(metric_name,
                                              mode=query['metric_params'].pop('mode'))
                    args_dict = {'sentences': actor_sentences(reference_set, actor_pair)}
                elif metric_name == 'distilbert-embedded chatbot classifier':
                    actor_pair = list(query['metric_actors'].values())[0]
                    # 'with_barney' selects the cached classifier and is not forwarded to compute()
                    metric = get_cache_metric(metric_name,
                                              with_barney=query['metric_params'].pop('with_barney'))
                    args_dict = {'sentences': actor_sentences(reference_set, actor_pair)}
                elif metric_name == 'neural chatbot classifier':
                    actor_pair = list(query['metric_actors'].values())[0]
                    classifier_char = query['metric_params']['classifier_char']
                    args_dict = {
                        'character': classifier_char,
                        'load_path': os.path.join(base_folder, "Data", "Characters",
                                      classifier_char, character_dict[classifier_char]['classifier_folder']),
                    }
                    metric = get_cache_metric(metric_name,
                                              classifier_char=classifier_char)
                    del query['metric_params']['classifier_char']
                    args_dict['sentences'] = actor_sentences(reference_set, actor_pair)
                else:
                    # Without this branch the metric and arguments of the previous query
                    # would be silently reused.
                    # NOTE(review): 'flesch-kincaid index' is requested by a later cell but
                    # has no branch here - confirm its compute() arguments before adding it
                    raise NotImplementedError("Metric '" + str(metric_name) +
                                              "' is not supported by evaluate_round")
                query_output['answer'] = metric.compute(**{**args_dict, **query['metric_params']})
                results[query_hash] = query_output
        except Exception as e:
            # Best effort: report the failure and move on to the next query
            print("Query failed due to " + str(type(e)) + " with message " + str(e))
        print()
    print("Done.")
    return results

# Example of Running an Evaluation

In [12]:
# Metric Name: See BBMetric.metrics_list
# Metric Params: See the optional and required params of each metric
## NOTE: For neural chatbot classifier, add 'classifier_char' as a parameter
# Metric Actors:
## DATASET_CHARCONTEXT: (any character | "Common") + "_df"
## DATASET_CHAR: (any character | "Common") + "_df"
## DIALOGPT_GREEDY: any character | "Base"
## DIALOGPT_NBEAMS: any character | "Base"
## DIALOGPT_SAMPLE: any character | "Base"
# Reference Set: (any character | "Common") + "_df"
# Metric Attempt: Defaults to 0, add a number to save multiple runs of the same query

In [60]:
# Example round: a single query comparing the responses of the Vader dataset with its contexts
queries = [
    dict(
        metric_name='google bleu',
        metric_actors=dict(
            predictor=(MetricActor.DATASET_CHAR, 'Vader_df'),
            reference=(MetricActor.DATASET_CHARCONTEXT, 'Vader_df'),
        ),
        reference_set='Vader_df',
        metric_params=dict(),
        metric_attempt=0,
    )
]

In [61]:
# Run the example round; the returned dict maps each query hash to its result record
evaluate_round(queries)

#### Running Query 1/1 ####
Evaluating bleurt (on reference set Vader_df) with:
	Vader_df dataset as predictor
	Vader_df dataset labels as reference


Using default BLEURT-Base checkpoint for sequence maximum length 128. You can use a bigger model for better results with e.g.: evaluate.load('bleurt', 'bleurt-large-512').


INFO:tensorflow:Reading checkpoint C:\Users\Valerio\.cache\huggingface\metrics\bleurt\default\downloads\extracted\3d33e07d20dc36fda3d97eef6258f85c53f0aaa9906a4530ff316c246d91a357\bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.

Done.


{'9901c208f2f724ef76bc34e0c39bfaad': {'metric_name': 'bleurt',
  'metric_version': 1,
  'metric_attempt': 0,
  'metric_actors': {'predictor': (<MetricActor.DATASET_CHAR: 1>, 'Vader'),
   'reference': (<MetricActor.DATASET_CHARCONTEXT: 0>, 'Vader')},
  'metric_params': {},
  'context': {'dialogpt_size': 'small',
   'dialogpt_context_sentences': 5,
   'dialogpt_nbeams_beams': 3,
   'dialogpt_sample_top_p': 0.92,
   'dialogpt_sample_top_k': 50},
  'metric_arity': <MetricArity.PAIRWISE: 2>,
  'metric_determinism': <MetricDeterminism.NEURAL: 2>,
  'reference_set': 'Vader_df',
  'answer': {'score': -1.4269872084259987, 'std': 0.29314456560953456}}}

# Run Evaluations

## Single Metrics

In [22]:
for metric in ['distinct', 'repetitiveness', 't5 grammar correction edit distance', 'flesch-kincaid index',
               'distilbert-embedded chatbot classifier', 'frequency chatbot classifier']:
    metric_pretty = BBMetric.load_metric(metric).pretty_name
    # Extra parameters required by the cached chatbot classifiers
    metric_params = dict()
    if metric == "distilbert-embedded chatbot classifier":
        # Must be one of the variant names used as keys of the metric cache
        metric_params = {'with_barney': 'Full'}
    elif metric == "frequency chatbot classifier":
        metric_params = {'mode': 'c-tf-idf'}
    # (actor, reference set) pairs evaluated for every metric
    targets = (
        # Responses of every dataset
        [((MetricActor.DATASET_CHAR, char + '_df'), char + '_df')
         for char in characters + ["Common"]]
        # Each character chatbot on its own dataset
        + [((MetricActor.DIALOGPT_SAMPLE, char), char + '_df')
           for char in characters]
        # Every chatbot, base one included, on the common dataset
        + [((MetricActor.DIALOGPT_SAMPLE, char), 'Common_df')
           for char in characters + ["Base"]]
    )
    results = evaluate_round([
        {
            'metric_name': metric,
            'metric_actors': {
                'document': actor
            },
            'reference_set': reference_set,
            # Each query gets its own dict, since evaluate_round removes keys from it
            'metric_params': dict(metric_params),
            'metric_attempt': 0
        } for actor, reference_set in targets
    ])
    # Merge the new results into the ones already saved for this metric
    metric_dict = load_metric_by_name(out_folder, metric_pretty)
    metric_dict = {**metric_dict, **results}
    save_metric_by_name(out_folder, metric_pretty, metric_dict)

#### Running Query 1/26 ####
Evaluating distinct on reference set Barney_df with:
	Barney_df dataset as document


Using custom data configuration default-d05c63f64527f593
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-d05c63f64527f593/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-d05c63f64527f593/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-e57fb526fe4a3ff3.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-d05c63f64527f593/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-54c202230f70778c.arrow
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-d05c63f64527f593/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-2b7ecddd83bcebad.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-d05c63f64527f593/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-f85aa8d6528292e4.arrow


Loaded cache at ['testset', 'Barney_df']

#### Running Query 2/26 ####
Evaluating distinct on reference set Sheldon_df with:
	Sheldon_df dataset as document


Using custom data configuration default-b47497d241584694
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-b47497d241584694/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-b47497d241584694/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-029452f8e061c873.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-b47497d241584694/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-c14841bc6fd888d0.arrow
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-b47497d241584694/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-a95f39f7704e7643.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-b47497d241584694/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-3155a76a827528d3.arrow


Loaded cache at ['testset', 'Sheldon_df']

#### Running Query 3/26 ####
Evaluating distinct on reference set Harry_df with:
	Harry_df dataset as document


Using custom data configuration default-8734ab070ca4d4a4
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-8734ab070ca4d4a4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-8734ab070ca4d4a4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-6f2c723a222e2152.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-8734ab070ca4d4a4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-06309c8f05ebbafb.arrow
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-8734ab070ca4d4a4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-065178db47afc42d.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-8734ab070ca4d4a4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-b63cdf0d40382d55.arrow


Loaded cache at ['testset', 'Harry_df']

#### Running Query 4/26 ####
Evaluating distinct on reference set Fry_df with:
	Fry_df dataset as document


Using custom data configuration default-edae583082198a82
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-edae583082198a82/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-edae583082198a82/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-3a40bd254dc1230b.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-edae583082198a82/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-76bce446c8545bcd.arrow
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-edae583082198a82/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-0d49f749e9201c30.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-edae583082198a82/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-2361c2a0ed34c32d.arrow


Loaded cache at ['testset', 'Fry_df']

#### Running Query 5/26 ####
Evaluating distinct on reference set Bender_df with:
	Bender_df dataset as document


Using custom data configuration default-36d673def5b55b14
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-36d673def5b55b14/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-36d673def5b55b14/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-45e9a5cdc4703907.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-36d673def5b55b14/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-3d73977ab48375f4.arrow
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-36d673def5b55b14/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-29c1b91d58842d36.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-36d673def5b55b14/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-15d3f645646c43ff.arrow


Loaded cache at ['testset', 'Bender_df']

#### Running Query 6/26 ####
Evaluating distinct on reference set Vader_df with:
	Vader_df dataset as document


Using custom data configuration default-3e37c23a51e9d556
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-3e37c23a51e9d556/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-3e37c23a51e9d556/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-6affe80547fc0f38.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-3e37c23a51e9d556/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-6800bd34204a2976.arrow
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-3e37c23a51e9d556/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-78893b2d23cae2d7.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-3e37c23a51e9d556/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-38619cd96763b020.arrow


Loaded cache at ['testset', 'Vader_df']

#### Running Query 7/26 ####
Evaluating distinct on reference set Joey_df with:
	Joey_df dataset as document


Using custom data configuration default-bc30ecf942c9a05e
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-bc30ecf942c9a05e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-bc30ecf942c9a05e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-ba044a5350a0b4ee.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-bc30ecf942c9a05e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-27a2b109b2a74d44.arrow
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-bc30ecf942c9a05e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-5b9e07b7a11ce3d7.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-bc30ecf942c9a05e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-5910ff0b0ae92adb.arrow


Loaded cache at ['testset', 'Joey_df']

#### Running Query 8/26 ####
Evaluating distinct on reference set Phoebe_df with:
	Phoebe_df dataset as document


Using custom data configuration default-437509a5e3c6484b
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-437509a5e3c6484b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-437509a5e3c6484b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-f3dedb84963f27fc.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-437509a5e3c6484b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-f69b60477b60f2fe.arrow
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-437509a5e3c6484b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-ae641b942c4f1782.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-437509a5e3c6484b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-1b02de643a9ce81e.arrow


Loaded cache at ['testset', 'Phoebe_df']

#### Running Query 9/26 ####
Evaluating distinct on reference set Common_df with:
	Common_df dataset as document


Using custom data configuration default-a0e685ebee7bf9f8
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/cache/csv/default-a0e685ebee7bf9f8/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Loaded cache at ['testset', 'Common_df']

#### Running Query 10/26 ####
Evaluating distinct on reference set Barney_df with:
	Barney dialogpt (sampling) as document
Loading predictions from stored file
Loaded predictions from stored file
Loaded cache at ['predictions', 'Barney_df', 'Barney', 'sampling']

#### Running Query 11/26 ####
Evaluating distinct on reference set Sheldon_df with:
	Sheldon dialogpt (sampling) as document
Loading predictions from stored file
Loaded predictions from stored file
Loaded cache at ['predictions', 'Sheldon_df', 'Sheldon', 'sampling']

#### Running Query 12/26 ####
Evaluating distinct on reference set Harry_df with:
	Harry dialogpt (sampling) as document
Loading predictions from stored file
Loaded predictions from stored file
Loaded cache at ['predictions', 'Harry_df', 'Harry', 'sampling']

#### Running Query 13/26 ####
Evaluating distinct on reference set Fry_df with:
	Fry dialogpt (sampling) as document
Loading predictions from stored file
Loaded predi

Using custom data configuration default-a0e685ebee7bf9f8
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/cache/csv/default-a0e685ebee7bf9f8/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\Data\Characters\Barney\barney_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


Creating predictions


  0%|                                                                                           | 0/35 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  3%|██▎                                                                                | 1/35 [00:06<03:47,  6.69s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  6%|████▋                                                                              | 2/35 [00:10<02:40,  4.86s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  9%|███████                                                                            | 3/35 [00:12<01:50,  3.44s/it]A dec

KeyboardInterrupt: 

In [None]:
# Neural chatbot classifier round. Three groups of queries are evaluated:
#   1. each character dataset (and the common dataset) as the document,
#   2. each character's sampled DialoGPT output on that character's own set,
#   3. each chatbot's sampled DialoGPT output (plus "Base") on the common set.
# Results are merged into the metric file already stored in `out_folder`.
metric = 'neural chatbot classifier'
metric_pretty = BBMetric.load_metric(metric).pretty_name
metric_params = dict()

# Group 1: dataset lines as the document, classifier tied to the same name.
dataset_queries = [
    dict(
        metric_name=metric,
        metric_actors={'document': (MetricActor.DATASET_CHAR, name + '_df')},
        reference_set=name + '_df',
        metric_params={'classifier_char': name},
        metric_attempt=0,
    )
    for name in characters + ["Common"]
]

# Group 2: sampled chatbot output, evaluated on the chatbot's own reference set.
own_set_queries = [
    dict(
        metric_name=metric,
        metric_actors={'document': (MetricActor.DIALOGPT_SAMPLE, name)},
        reference_set=name + '_df',
        metric_params={'classifier_char': name},
        metric_attempt=0,
    )
    for name in characters
]

# Group 3: sampled chatbot output (including "Base"), evaluated on the common set.
common_set_queries = [
    dict(
        metric_name=metric,
        metric_actors={'document': (MetricActor.DIALOGPT_SAMPLE, name)},
        reference_set='Common_df',
        metric_params={'classifier_char': name},
        metric_attempt=0,
    )
    for name in characters + ["Base"]
]

results = evaluate_round(dataset_queries + own_set_queries + common_set_queries)

# Fresh results take precedence over previously stored entries with the same key.
metric_dict = dict(load_metric_by_name(out_folder, metric_pretty))
metric_dict.update(results)
save_metric_by_name(out_folder, metric_pretty, metric_dict)

In [None]:
# Perplexity round: every chatbot (each character plus "Base") acts as the
# predictor and is evaluated on every reference set (each character's
# dataset plus the common one), i.e. the full chatbot x reference-set grid.
# Results are merged into the metric file already stored in `out_folder`.
metric = 'perplexity'
metric_pretty = BBMetric.load_metric(metric).pretty_name
metric_params = dict()
results = evaluate_round([
    {
        'metric_name': metric,
        'metric_actors': {
            'predictor': (MetricActor.DIALOGPT_SAMPLE, char)
        },
        'reference_set': char2 + '_df',
        'metric_params': {},
        'metric_attempt': 0
    } for char in characters + ["Base"]
      for char2 in characters + ["Common"]
])  # fix: the call's closing parenthesis was missing, making the cell a SyntaxError
# Fresh results take precedence over previously stored entries with the same key.
metric_dict = load_metric_by_name(out_folder, metric_pretty)
metric_dict = {**metric_dict, **results}
save_metric_by_name(out_folder, metric_pretty, metric_dict)

# COMET

In [None]:
# COMET round: one query per character, combining three actors -- the
# character-context dataset as the document, the character dataset as the
# reference, and the character's sampled DialoGPT output as the predictor.
# Results are merged into the metric file already stored in `out_folder`.
metric = "comet"
metric_pretty = BBMetric.load_metric(metric).pretty_name
metric_params = dict()

comet_queries = [
    dict(
        metric_name=metric,
        metric_actors=dict(
            document=(MetricActor.DATASET_CHARCONTEXT, name + '_df'),
            reference=(MetricActor.DATASET_CHAR, name + "_df"),
            predictor=(MetricActor.DIALOGPT_SAMPLE, name),
        ),
        reference_set=name + '_df',
        metric_params={},
        metric_attempt=0,
    )
    for name in characters
]
results = evaluate_round(comet_queries)

# Fresh results take precedence over previously stored entries with the same key.
metric_dict = dict(load_metric_by_name(out_folder, metric_pretty))
metric_dict.update(results)
save_metric_by_name(out_folder, metric_pretty, metric_dict)

# Pairwise Metrics

In [None]:
# Pairwise metrics that take a 'reference' and a 'predictor' actor.
# For each metric two groups of queries are evaluated:
#   1. each character's sampled DialoGPT output against that character's
#      dataset, on the character's own reference set;
#   2. selected chatbot pairs against each other on the common set, with the
#      first name of the pair as predictor and the second as reference.
# Results are merged into the per-metric file already stored in `out_folder`.
for metric in ['google bleu', 'meteor', 'bertscore', 'bartscore', 'bleurt', 'term error rate']:
    metric_pretty = BBMetric.load_metric(metric).pretty_name
    results = evaluate_round([
        {
            'metric_name': metric,
            'metric_actors': {
                'reference': (MetricActor.DATASET_CHAR, char + "_df"),
                'predictor': (MetricActor.DIALOGPT_SAMPLE, char)
            },
            'reference_set': char + '_df',
            'metric_params': {},
            'metric_attempt': 0
        } for char in characters
    ] + [
        {
            'metric_name': metric,
            'metric_actors': {
                'reference': (MetricActor.DIALOGPT_SAMPLE, charpair[1]),
                'predictor': (MetricActor.DIALOGPT_SAMPLE, charpair[0])
            },
            'reference_set': 'Common_df',
            'metric_params': {},
            'metric_attempt': 0
        } for charpair in [('Joey', 'Phoebe'), ('Joey', 'Sheldon'), ('Bender', 'Fry'), ('Bender', 'Barney')]
    ])
    # Fresh results take precedence over previously stored entries with the same key.
    metric_dict = load_metric_by_name(out_folder, metric_pretty)
    metric_dict = {**metric_dict, **results}
    # fix: was `pretty_name`, a name this cell never defines; the file must be
    # saved under the same `metric_pretty` it was loaded from.
    save_metric_by_name(out_folder, metric_pretty, metric_dict)
    
# Distance-style pairwise metrics. For each metric two groups of queries run:
#   1. each character's sampled DialoGPT output ('predictor') against that
#      character's dataset ('reference'), on the character's own reference set;
#   2. selected chatbot pairs compared on the common set, passed as
#      'document0' (second name of the pair) and 'document1' (first name).
# Results are merged into the per-metric file already stored in `out_folder`.
for metric in ['extended edit distance', 'word mover distance']:
    metric_pretty = BBMetric.load_metric(metric).pretty_name

    self_queries = [
        dict(
            metric_name=metric,
            metric_actors=dict(
                reference=(MetricActor.DATASET_CHAR, name + "_df"),
                predictor=(MetricActor.DIALOGPT_SAMPLE, name),
            ),
            reference_set=name + '_df',
            metric_params={},
            metric_attempt=0,
        )
        for name in characters
    ]

    cross_queries = [
        dict(
            metric_name=metric,
            metric_actors=dict(
                document0=(MetricActor.DIALOGPT_SAMPLE, second),
                document1=(MetricActor.DIALOGPT_SAMPLE, first),
            ),
            reference_set='Common_df',
            metric_params={},
            metric_attempt=0,
        )
        for first, second in [
            ('Joey', 'Phoebe'),
            ('Joey', 'Sheldon'),
            ('Bender', 'Fry'),
            ('Bender', 'Barney'),
            ('Barney', 'Harry'),
        ]
    ]

    results = evaluate_round(self_queries + cross_queries)

    # Fresh results take precedence over previously stored entries with the same key.
    metric_dict = dict(load_metric_by_name(out_folder, metric_pretty))
    metric_dict.update(results)
    save_metric_by_name(out_folder, metric_pretty, metric_dict)