# Setup

In [1]:
# When True, the cache helpers print a message each time an entry is loaded or flushed.
verbose = True
# Forwarded as `use_cuda` when the distilbert-embedded chatbot classifier is loaded.
use_cuda = False

# When True, the "Training metrics" cell trains the chatbot classifiers; otherwise it is skipped.
do_metric_training = False
# When True, the predictions cell regenerates the chatbot predictions and saves them to file.
do_predictions = False

In [2]:
### Run environment setup
import os
import lib.BBSetup as BBSetup

# Local checkout used whenever the notebook is not running on Google Colab.
_LOCAL_BASE_FOLDER = r"E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot"

# Try the setups in order: Colab -> manual Anaconda env -> automatic Anaconda env.
# `except Exception` (rather than a bare `except`) keeps the fallback behaviour for any
# setup failure while letting KeyboardInterrupt / SystemExit stop the cell.
try:
    from google.colab import drive
    BBSetup.colab_setup(mount_folder=r"/content/drive/My Drive/unibo/NLP_project/BarneyBot")
except Exception:
    try:
        BBSetup.anaconda_manual_setup(base_folder=_LOCAL_BASE_FOLDER,
                                      env_name="barneybot")
    except Exception:
        BBSetup.anaconda_auto_setup(base_folder=_LOCAL_BASE_FOLDER)

### Define folders
base_folder = BBSetup.BASE_FOLDER
# Character data is read from Data/Characters; metric results are written to Metrics/New.
in_folder = BBSetup.set_folder(os.path.join(base_folder, 'Data', 'Characters'))
out_folder = BBSetup.set_folder(os.path.join(base_folder, 'Metrics', 'New'))

pip install -r "E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\requirements.txt"


In [3]:
### load_char_df() (hg dataset) ['test'] to get testset, containing contexts and response
### get_chatbot_predictions() to get a type of predictions for a model
from lib.BBDataLoad import load_char_df, get_chatbot_predictions, dialogpt_preprocess_function
from datasets import load_dataset
from transformers import TFAutoModelForCausalLM
from lib.BBMetrics import BBMetric
from lib.BBMetricResults import *
from tqdm import tqdm

from lib.BBData import character_dict, model_name, random_state
import lib.BBData as BBData
# Names of all configured characters, in dictionary order, without the 'Default' entry.
characters = [*character_dict]
characters.remove('Default')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  "class": algorithms.Blowfish,
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tonel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Import structures from HuggingFace
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling

# Tokenizer of the base model; downloaded files are kept under <base_folder>/cache.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=os.path.join(base_folder, "cache"),
)
# '#' is used as the padding token.
tokenizer.pad_token = '#'
# Collator producing TensorFlow tensors, with masked-language-modelling disabled.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    return_tensors='tf',
)

In [5]:
def flatten(S):
    """Return a flat list with the elements of an arbitrarily nested list.

    Only `list` instances are descended into; any other element (tuples, strings,
    dicts, ...) is copied to the output as-is. Element order is preserved.

    The traversal is iterative, so its depth is bounded by the nesting depth of `S`
    rather than by its length: long flat lists no longer hit the recursion limit,
    and no intermediate lists are concatenated.

    Args:
        S: the (possibly nested) list to flatten.

    Returns:
        A new list containing the non-list elements of `S` in depth-first order.
    """
    flat = []
    # Stack of iterators, one per nesting level currently being walked.
    pending = [iter(S)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend; the parent iterator resumes where it stopped afterwards.
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            # Current level exhausted: go back to the parent level.
            pending.pop()
    return flat

In [6]:
if do_predictions:
    print("Saving predictions to file")
    # Per character: 3 decoding strategies for the fine-tuned chatbot + 1 run of the base chatbot.
    with tqdm(total=len(characters)*4) as pbar:
        # Chatbot of a character on their own dataset
        for char in characters:
            checkpoint_folder = os.path.join(in_folder, char,
                                             character_dict[char]['checkpoint_folder'])
            model = TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint_folder)
            model.compile()
            samples = load_char_df(char)
            for gen_type in ['greedy', 'nbeams', 'sampling']:
                # Use the notebook-level `tokenizer`: the `cache` namespace is only
                # created in a later cell, so `cache.tokenizer` is not available here.
                get_chatbot_predictions(samples['test']['context/0'], model,
                              character_dict[char]['prediction_filename'] + '_' + gen_type + '.json',
                              gen_type, char, tokenizer, base_folder, override_predictions=True)
                pbar.update(1)
        # Base chatbot on each character's dataset.
        # The base model is the same for every character, so it is loaded once.
        model = TFAutoModelForCausalLM.from_pretrained(model_name,
                                                       cache_dir=os.path.join(base_folder, "cache"))
        model.compile()
        for char in characters:
            samples = load_char_df(char)
            # The generation type is spelled out (it matches the file name) instead of
            # relying on the value leaked by the loop above.
            get_chatbot_predictions(samples['test']['context/0'], model,
                              'from_' + char + "_df__sampling.json", 'sampling',
                              "Default", tokenizer, base_folder, override_predictions=True)
            pbar.update(1)

In [7]:
if do_metric_training:
    print("Training metrics")
    # One step per character classifier, plus the two distilbert-embedded classifiers.
    with tqdm(total=len(characters) + 2) as pbar:
        # Neural Chatbot Classifier
        # Iterate over the plain list: progress is already reported through `pbar`,
        # wrapping the loop in a second tqdm would draw a duplicate bar.
        for char in characters:
            neural_classifier = BBMetric.load_metric("neural chatbot classifier")
            neural_classifier.train(character=char, random_state=random_state,
                     source_encoded_path=None,
                     source_path=os.path.join(base_folder, "Data", "Sources",
                                              character_dict[char]['source'],
                                              character_dict[char]['source'] + ".csv"),
                     source_save_path=os.path.join(base_folder, "Data", "Characters", char),
                     save_path=os.path.join(base_folder, "Data", "Characters", char))
            pbar.update(1)
        # Distilbert-Embedded Chatbot Classifier, trained on every character
        bertembedded_classifier = BBMetric.load_metric("distilbert-embedded chatbot classifier")
        bertembedded_classifier.train(characters_path=os.path.join(base_folder, "Data", "Characters"),
                                      save_path=os.path.join(base_folder, "Data", "Metrics",
                                                             "distilbert_embedder"),
                                      train_embedder=True,
                                      verbose=True)
        pbar.update(1)
        # Same classifier, trained on every character except Barney
        characters_no_barney = characters.copy()
        characters_no_barney.remove("Barney")
        bertembedded_classifier = BBMetric.load_metric("distilbert-embedded chatbot classifier")
        bertembedded_classifier.metric.set_characters(characters_no_barney)
        bertembedded_classifier.train(characters_path=os.path.join(base_folder, "Data", "Characters"),
                                      save_path=os.path.join(base_folder, "Data", "Metrics",
                                                             "distilbert_embedder_nobarney"),
                                      train_embedder=True,
                                      verbose=True)
        pbar.update(1)
else:
    print("Skipping Metrics training.")

Skipping Metrics training.


# Cache System Creation

In [8]:
# Define a cache structure to avoid reloading stuff
from types import SimpleNamespace

# Every slot starts as None and is filled lazily by the get_cache_* helpers.
cache = SimpleNamespace(
    # Chatbot models, one per character plus the base model
    dialogpt=dict.fromkeys(characters + ["Base"]),
    tokenizer=None,
    datacollator=None,
    # Metrics that need training/loading before use
    trained_metric={
        'neural chatbot classifier': dict.fromkeys(characters),
        'frequency chatbot classifier': dict.fromkeys(('c-tf-idf', 'tf-idf', 'word frequency')),
        'distilbert-embedded chatbot classifier': dict.fromkeys(('Full', 'No Barney')),
    },
    # Test sets, one per character plus the common dataset
    testset={dataset_char + "_df": None for dataset_char in characters + ["Common"]},
    concat_and_encoded_testset={dataset_char + "_df": None
                                for dataset_char in characters + ["Common"]},
    # predictions[dataset][chatbot][generation type]
    predictions={
        dataset_char + "_df": {
            chatbot_char: dict.fromkeys(('greedy', 'nbeams', 'sampling'))
            for chatbot_char in characters + ["Base"]
        }
        for dataset_char in characters + ["Common"]
    },
)

def _as_cache_mapping(node, entry):
    """Return the dict holding the children of a cache node (a dict or a SimpleNamespace)."""
    if isinstance(node, dict):
        return node
    if isinstance(node, SimpleNamespace):
        # Attributes of a SimpleNamespace are stored in its instance dictionary.
        return vars(node)
    raise TypeError("Cannot descend into " + type(node).__name__ +
                    " while resolving cache entry " + str(entry))


def _resolve_cache_slot(entry):
    """Walk `cache` along entry[:-1]; return (mapping holding the slot, slot key)."""
    if not entry:
        raise ValueError("A cache entry path must contain at least one key")
    holder = _as_cache_mapping(cache, entry)
    for key in entry[:-1]:
        holder = _as_cache_mapping(holder[key], entry)
    return holder, entry[-1]


def load_cache_entry(value, entry):
    """Store `value` in the cache slot addressed by `entry`, unless the slot is already filled.

    Args:
        value: object to store.
        entry: sequence of keys leading from the `cache` namespace to the slot,
            e.g. ['predictions', 'Barney_df', 'Barney', 'greedy'] or ['tokenizer'].

    Returns:
        The content of the slot after the call (the previously cached object if the
        slot was not None, `value` otherwise).

    Raises:
        KeyError: if a key of `entry` does not exist.
        TypeError: if the path goes through something that is neither a dict nor a SimpleNamespace.
        ValueError: if `entry` is empty.
    """
    holder, key = _resolve_cache_slot(entry)
    # None is the "empty slot" sentinel; a cached falsy value (e.g. an empty list) is kept.
    if holder[key] is None:
        holder[key] = value
        if verbose:
            print("Loaded cache at " + str(entry))
    return holder[key]

def flush_cache_entries(entries):
    """Reset to None every cache slot addressed by the key paths in `entries`.

    Args:
        entries: iterable of key paths, each in the format accepted by load_cache_entry.

    Raises:
        KeyError: if an intermediate key of a path does not exist.
        TypeError: if a path goes through something that is neither a dict nor a SimpleNamespace.
        ValueError: if a path is empty.
    """
    for entry in entries:
        holder, key = _resolve_cache_slot(entry)
        holder[key] = None
        if verbose:
            print("Flushed cache at " + str(entry))

In [9]:
def get_cache_testset(character, base_folder):
    """Return the test set of `character`, loading and caching it on first use.

    For "Common" the 'train' split of Data/Sources/common_dataset.csv is used;
    for any other character the 'test' split returned by load_char_df.
    """
    slot = character + "_df"
    if cache.testset[slot]:
        return cache.testset[slot]
    if character == "Common":
        common_csv = os.path.join(base_folder, 'Data', 'Sources', 'common_dataset.csv')
        df = load_dataset('csv',
                          data_files=common_csv,
                          cache_dir=os.path.join(base_folder, "cache"))['train']
    else:
        df = load_char_df(character, base_folder)['test']
    load_cache_entry(df, ['testset', slot])
    return cache.testset[slot]

# For perplexity
def get_cache_concat_and_encoded_testset(character, base_folder):
    """Return the encoded, batched test set of `character`, building and caching it on first use.

    Each row of the cached test set goes through dialogpt_preprocess_function with the
    cached tokenizer; the result is turned into an unshuffled TensorFlow dataset with
    batches of 8, collated by the cached data collator.
    """
    slot = character + "_df"
    if cache.concat_and_encoded_testset[slot]:
        return cache.concat_and_encoded_testset[slot]
    encoded_rows = get_cache_testset(character, base_folder).map(
        lambda row: dialogpt_preprocess_function(row, cache.tokenizer),
        batched=False,
    )
    tf_batches = encoded_rows.to_tf_dataset(
        columns=["input_ids", "attention_mask", "labels"],
        shuffle=False,
        batch_size=8,
        collate_fn=cache.datacollator,
    )
    load_cache_entry(tf_batches, ['concat_and_encoded_testset', slot])
    return cache.concat_and_encoded_testset[slot]

def get_cache_predictions(dataset_from, character, base_folder, gen_type):
    """Return the decoded predictions of a chatbot on a dataset, caching them on first use.

    Three cases are supported:
      * dataset_from == "Common_df": predictions are generated on the fly (no file
        caching) by the model of `character` on the common dataset contexts;
      * character == "Base" on a character dataset: predictions are read from
        'from_<dataset_from>__<gen_type>.json';
      * a character on their own dataset: predictions are read from
        '<prediction_filename>_<gen_type>.json'.

    Args:
        dataset_from: dataset name, "<character>_df" or "Common_df".
        character: chatbot name, a character or "Base".
        base_folder: project base folder.
        gen_type: generation type, a key of the predictions cache ('greedy', 'nbeams', 'sampling').

    Returns:
        List of decoded prediction strings.

    Raises:
        NotImplementedError: for a character chatbot on another character's dataset.
        KeyError: if the (dataset, chatbot, generation type) triple has no cache slot.
    """
    if cache.predictions[dataset_from][character][gen_type] is None:
        if dataset_from == "Common_df":
            df = load_dataset('csv',
                         data_files=os.path.join(base_folder, 'Data', 'Sources', 'common_dataset.csv'),
                         cache_dir=os.path.join(base_folder, "cache"))
            df = df.remove_columns(['source'])
            model = get_cache_model(character)
            predictions_tk = get_chatbot_predictions(df['train']['context/0'], model,
                  "", gen_type, character, cache.tokenizer, base_folder, file_caching=False, override_predictions=False)
        elif character == "Base":
            # Base chatbot on a character dataset. This must not be nested under the
            # `dataset_from == character + "_df"` test: there is no "Base_df" dataset,
            # so that condition can never hold for the base chatbot.
            predictions_tk = get_chatbot_predictions(None, None,
                  'from_' + dataset_from + '__' + gen_type + '.json',
                  None, 'Default', None, base_folder, override_predictions=False)
        elif dataset_from == character + "_df":
            predictions_tk = get_chatbot_predictions(None, None,
                  character_dict[character]['prediction_filename'] + '_' + gen_type + '.json',
                  None, character, None, base_folder, override_predictions=False)
        else:
            raise NotImplementedError("Unexpected predictions to load!")
        # Predictions come back tokenized; decode them to plain sentences.
        predictions = [cache.tokenizer.decode(line, skip_special_tokens=True)
                       for line in predictions_tk]
        load_cache_entry(predictions, ['predictions', dataset_from, character, gen_type])
    return cache.predictions[dataset_from][character][gen_type]

# For metrics worth caching, in particular the chatbot classifiers
def get_cache_metric(metric_name, **kwargs):
    """Return the metric `metric_name`, reusing a cached, ready-to-use instance when one exists.

    Metrics without a slot in cache.trained_metric are simply loaded anew. For the
    cached ones, the slot is selected by a keyword argument:
      * 'neural chatbot classifier': `classifier_char`;
      * 'frequency chatbot classifier': `mode`;
      * 'distilbert-embedded chatbot classifier': `with_barney`
        (truthy -> 'Full', otherwise -> 'No Barney').
    """
    classifier_char = kwargs.get('classifier_char')
    mode = kwargs.get('mode')
    barney_slot = 'Full' if kwargs.get('with_barney') else 'No Barney'

    if metric_name not in cache.trained_metric:
        return BBMetric.load_metric(metric_name)

    slots = cache.trained_metric[metric_name]

    if metric_name == "neural chatbot classifier":
        if not slots[classifier_char]:
            slots[classifier_char] = BBMetric.load_metric(metric_name)
            # Dummy round for caching
            slots[classifier_char].compute(
                character=classifier_char,
                load_path=os.path.join(base_folder, "Data", "Characters",
                                       classifier_char,
                                       character_dict[classifier_char]['classifier_folder']),
                sentences=["Hi", "Hello", "How"])
        return slots[classifier_char]

    if metric_name == "frequency chatbot classifier":
        if not slots[mode]:
            slots[mode] = BBMetric.load_metric(metric_name)
            slots[mode].train(
                characters_path=os.path.join(base_folder, "Data", "Characters"),
                mode=mode)
        return slots[mode]

    if metric_name == "distilbert-embedded chatbot classifier":
        if not slots[barney_slot]:
            # The two variants only differ in the folder of the pretrained embedder.
            if barney_slot == 'Full':
                embedder_folder = "distilbert_embedder"
            else:
                embedder_folder = "distilbert_embedder_nobarney"
            slots[barney_slot] = BBMetric.load_metric(
                metric_name,
                embedder_path=os.path.join(base_folder, "Data", "Metrics", embedder_folder),
                from_pretrained=True, use_cuda=use_cuda)
            slots[barney_slot].train(
                characters_path=os.path.join(base_folder, "Data", "Characters"),
                save_path=None, train_embedder=False
            )
        return slots[barney_slot]

def get_cache_model(character):
    """Return the compiled chatbot model of `character`, loading it only on first use.

    "Base" loads the pretrained `model_name`; any other character loads the checkpoint
    stored under its character folder. The loaded model is kept in cache.dialogpt and
    reused by later calls until that cache slot is reset to None.

    Args:
        character: a character name or "Base".

    Returns:
        The cached model for `character`.
    """
    if cache.dialogpt[character] is None:
        if character == "Base":
            model = TFAutoModelForCausalLM.from_pretrained(model_name, cache_dir=os.path.join(base_folder, "cache"))
        else:
            checkpoint_folder = os.path.join(in_folder, character, character_dict[character]['checkpoint_folder'])
            model = TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint_folder)
        model.compile()
        cache.dialogpt[character] = model
    return cache.dialogpt[character]

# Register the notebook-level tokenizer and data collator in the cache,
# where the get_cache_* helpers read them from.
cache.tokenizer = tokenizer
cache.datacollator = data_collator

# Evaluation Process Definition

In [10]:
def sentence_callable(reference_set, character, column):
    """Return the sentences of `column` for `character` on `reference_set`.

    The dataset columns "context/0" and "response" are read from the cached test set
    of `character`; any other column is treated as a generation type and read from
    the cached predictions.
    """
    own_dataset = character + "_df"
    if column in ("context/0", "response"):
        # Dataset columns can only be read from the character's own dataset.
        assert reference_set == own_dataset
        return get_cache_testset(character, base_folder)[column]
    # Predictions: own dataset, common dataset, or base chatbot with sampling.
    assert (reference_set == own_dataset
            or reference_set == "Common_df"
            or (character == "Base" and column == "sampling"))
    return get_cache_predictions(reference_set, character, base_folder, column)

def perplexity_callable(reference_set, character):
    """Build the keyword arguments used to compute perplexity.

    Returns a dict with the model of `character` ('model') and the encoded test set
    of the dataset named by `reference_set` without its "_df" part ('encoded_test_set').
    """
    model = get_cache_model(character)
    dataset_character = reference_set.replace("_df", "")
    encoded_test_set = get_cache_concat_and_encoded_testset(dataset_character, base_folder)
    return {'model': model, 'encoded_test_set': encoded_test_set}

In [11]:
def evaluate_round(queries):
    """Run a batch of metric queries and collect their results.

    Each query is a dict of one of two kinds:
      * a query with a 'run' key only calls query['run'](**query['run_args']) and
        produces no result;
      * any other query must provide 'metric_name', 'metric_actors'
        (role -> (MetricActor, name)), 'reference_set' and 'metric_params', and may
        provide 'metric_version' (default 1) and 'metric_attempt' (default 0).

    A query that raises is reported on stdout and skipped; the remaining queries
    still run. The dicts in `queries` (including their nested 'metric_actors' and
    'metric_params') are left untouched, so the same list can be evaluated again.

    Args:
        queries: list of query dicts.

    Returns:
        Dict mapping the hash of each successful metric query to its output record
        (metric metadata, generation context, reference set and 'answer').
    """
    actors_pprint_map = {
        MetricActor.DATASET_CHAR: "dataset",
        MetricActor.DATASET_CHARCONTEXT: "dataset labels",
        MetricActor.DIALOGPT_GREEDY: "dialogpt (greedy)",
        MetricActor.DIALOGPT_NBEAMS: "dialogpt (nbeamns)",
        MetricActor.DIALOGPT_SAMPLE: "dialogpt (sampling)"
    }
    actor_to_column_map = {
        MetricActor.DATASET_CHARCONTEXT: 'context/0',
        MetricActor.DATASET_CHAR: 'response',
        MetricActor.DIALOGPT_GREEDY: 'greedy',
        MetricActor.DIALOGPT_NBEAMS: 'nbeams',
        MetricActor.DIALOGPT_SAMPLE: 'sampling'
    }

    def actor_sentences(query, actor_pair):
        # Sentences produced by one actor (dataset column or chatbot predictions).
        return sentence_callable(query['reference_set'],
                                 actor_pair[1],
                                 actor_to_column_map[actor_pair[0]])

    def sentences_by_role(query, args_map):
        # One keyword argument per actor role, named as the metric expects it.
        return {args_map[actor_key]: actor_sentences(query, actor_pair)
                for actor_key, actor_pair in query['metric_actors'].items()}

    def first_actor(query):
        return list(query['metric_actors'].values())[0]

    results = dict()
    for i, original_query in enumerate(queries):
        try:
            query = original_query.copy() # Since there are destructive operations
            print("#### Running Query " + str(i+1) + "/" + str(len(queries)) + " ####")
            if 'run' in query:
                query['run'](**query['run_args'])
            else:
                print("Evaluating " + query['metric_name'] + \
                      " on reference set " + query['reference_set'] + " with:")
                for actor_type, actor in query['metric_actors'].items():
                    print("\t" + actor[1] + " " + actors_pprint_map[actor[0]] + " as " + actor_type)
                # The top-level copy above is shallow: copy the two nested dicts that are
                # edited below, so the caller's query is not modified.
                query['metric_actors'] = dict(query['metric_actors'])
                query['metric_params'] = dict(query['metric_params'])
                # Get metric metadata data for outputting
                query_output = dict()
                query_output['metric_name'] = query['metric_name']
                query_output['metric_version'] = 1 if 'metric_version' not in query else query['metric_version']
                query_output['metric_attempt'] = 0 if 'metric_attempt' not in query else query['metric_attempt']
                # NOTE: the output record shares these two dicts with `query`, so the stored
                # record reflects the edits made to them after the hash is computed
                # ("_df" stripped from dataset actors, dispatch-only parameters removed).
                query_output['metric_actors'] = query['metric_actors']
                query_output['metric_params'] = query['metric_params']
                query_output['context'] = {
                    "dialogpt_size": "small",
                    "dialogpt_context_sentences": BBData.context_n,
                    "dialogpt_nbeams_beams": BBData.n_beams,
                    "dialogpt_sample_top_p": BBData.top_p,
                    "dialogpt_sample_top_k": BBData.top_k
                }
                query_output['metric_arity'] = get_metric_arity(query['metric_name'])
                query_output['metric_determinism'] = get_metric_determinism(query['metric_name'],
                                                                            query_output['metric_version'])
                query_output['reference_set'] = query['reference_set']
                query_hash = dict_hash({'metric_name': query_output['metric_name'],
                                        'metric_version': query_output['metric_version'],
                                        'reference_set': query_output['reference_set'],
                                        'metric_attempt': query_output['metric_attempt'],
                                        'metric_actors': query_output['metric_actors'],
                                        'context': query_output['context'],
                                        'metric_params': query_output['metric_params']})
                for key in query['metric_actors'].keys(): # Lazy fix for "_df" suffix
                    if query['metric_actors'][key][0] == MetricActor.DATASET_CHARCONTEXT or \
                        query['metric_actors'][key][0] == MetricActor.DATASET_CHAR:
                        query['metric_actors'][key] = (query['metric_actors'][key][0],
                                                       query['metric_actors'][key][1].replace("_df", ""))
                # Compute the actual metric
                if query['metric_name'] in ['google bleu', 'meteor', 'rouge l', 'mpnet embedding similarity',
                                'emotion classifier', 'distinct', 'roberta crossencoding similarity',
                                'repetitiveness', 'term error rate', 'bertscore', 'bleurt', 'bartscore',
                                'word mover distance', 't5 grammar correction edit distance',
                                'extended edit distance', 'flesch-kincaid index']:
                    args_map = {
                        'predictor': 'predictions', 'reference': 'references', 'document': 'sentences',
                        'document0': 'sentences_a', 'document1': 'sentences_b'
                    }
                    metric = get_cache_metric(query['metric_name'])
                    args_dict = sentences_by_role(query, args_map)
                elif query['metric_name'] == 'comet':
                    args_map = {
                        'predictor': 'predictions', 'reference': 'references', 'document': 'sources'
                    }
                    metric = get_cache_metric(query['metric_name'])
                    args_dict = sentences_by_role(query, args_map)
                elif query['metric_name'] in ['perplexity']:
                    actor_pair = first_actor(query)
                    metric = get_cache_metric(query['metric_name'])
                    args_dict = perplexity_callable(query['reference_set'],
                                                    actor_pair[1])
                elif query['metric_name'] in ['frequency chatbot classifier']:
                    actor_pair = first_actor(query)
                    metric = get_cache_metric(query['metric_name'],
                                              mode=query['metric_params']['mode'])
                    # 'mode' only selects the cached metric; it is not a compute() argument
                    del query['metric_params']['mode']
                    args_dict = {
                        'sentences': actor_sentences(query, actor_pair)
                    }
                elif query['metric_name'] in ['distilbert-embedded chatbot classifier']:
                    actor_pair = first_actor(query)
                    metric = get_cache_metric(query['metric_name'],
                                              with_barney=query['metric_params']['with_barney'])
                    # 'with_barney' only selects the cached metric; it is not a compute() argument
                    del query['metric_params']['with_barney']
                    args_dict = {
                        'sentences': actor_sentences(query, actor_pair)
                    }
                elif query['metric_name'] in ['neural chatbot classifier']:
                    actor_pair = first_actor(query)
                    classifier_char = query['metric_params']['classifier_char']
                    args_dict = {
                        'character': classifier_char,
                        'load_path': os.path.join(base_folder, "Data", "Characters",
                                      classifier_char, character_dict[classifier_char]['classifier_folder']),
                    }
                    metric = get_cache_metric(query['metric_name'],
                                              classifier_char=classifier_char)
                    # 'classifier_char' is passed to compute() as 'character' instead
                    del query['metric_params']['classifier_char']
                    args_dict['sentences'] = actor_sentences(query, actor_pair)
                else:
                    # Without this branch, `metric` and `args_dict` of the previous query
                    # would be silently reused for an unsupported metric.
                    raise NotImplementedError("Unsupported metric: " + str(query['metric_name']))
                query_output['answer'] = metric.compute(**{**args_dict, **query['metric_params']})
                results[query_hash] = query_output
        except Exception as e:
            print("Query failed due to " + str(type(e)) + " with message " + str(e))
        print()
    print("Done.")
    return results

# Example of Running an Evaluation

In [12]:
# Metric Name: See BBMetric.metrics_list
# Metric Params: See the optional and required params of each metric
## NOTE: For neural chatbot classifier, add 'classifier_char' as a parameter
# Metric Actors:
## DATASET_CHARCONTEXT: (any character | "Common") + "_df"
## DATASET_CHAR: (any character | "Common") + "_df"
## DIALOGPT_GREEDY: any character | "Base"
## DIALOGPT_NBEAMS: any character | "Base"
## DIALOGPT_SAMPLE: any character | "Base"
# Reference Set: (any character | "Common") + "_df"
# Metric Attempt: Defaults to 0, add a number to save multiple runs of the same query

In [13]:
# Example round: one Google BLEU query on Vader's dataset, with the dataset responses
# as predictions and the dataset contexts as references.
queries = [
    dict(
        metric_name='google bleu',
        metric_actors=dict(
            predictor=(MetricActor.DATASET_CHAR, 'Vader_df'),
            reference=(MetricActor.DATASET_CHARCONTEXT, 'Vader_df'),
        ),
        reference_set='Vader_df',
        metric_params=dict(),
        metric_attempt=0,
    )
]

In [14]:
#evaluate_round(queries)

# Run Evaluations

## Single Metrics

In [15]:
# Disabled cell: the code below is wrapped in a string literal, so nothing in it
# runs; the cell merely evaluates to that string. Remove the surrounding triple
# quotes to execute it.
# When enabled, for each listed single-document metric it builds three groups of
# queries (DATASET_CHAR on each character's own set plus "Common", DIALOGPT_SAMPLE
# on each character's own set, DIALOGPT_SAMPLE on 'Common_df' including "Base"),
# passes them to evaluate_round, and merges the results into the stored metric
# file via load_metric_by_name / save_metric_by_name.
'''
for metric in ['distilbert-embedded chatbot classifier', 'frequency chatbot classifier', 'emotion classifier',
               'distinct', 'repetitiveness', 't5 grammar correction edit distance', 'flesch-kincaid index']:
    metric_pretty = BBMetric.load_metric(metric).pretty_name
    metric_params = dict()
    if metric == "distilbert-embedded chatbot classifier":
        metric_params = {'with_barney': True}
    elif metric == "frequency chatbot classifier":
        metric_params = {'mode': 'c-tf-idf'}
    results = evaluate_round([
        {
            'metric_name': metric,
            'metric_actors': {
                'document': (MetricActor.DATASET_CHAR, char + '_df')
            },
            'reference_set': char + '_df',
            'metric_params': metric_params.copy(),
            'metric_attempt': 0
        } for char in characters + ["Common"]
    ] + [
        {
            'metric_name': metric,
            'metric_actors': {
                'document': (MetricActor.DIALOGPT_SAMPLE, char)
            },
            'reference_set': char + '_df',
            'metric_params': metric_params.copy(),
            'metric_attempt': 0
        } for char in characters
    ] + [
        {
            'metric_name': metric,
            'metric_actors': {
                'document': (MetricActor.DIALOGPT_SAMPLE, char)
            },
            'reference_set': 'Common_df',
            'metric_params': metric_params.copy(),
            'metric_attempt': 0
        } for char in characters + ["Base"]
    ])
    metric_dict = load_metric_by_name(out_folder, metric_pretty)
    metric_dict = {**metric_dict, **results}
    save_metric_by_name(out_folder, metric_pretty, metric_dict)
'''

'\nfor metric in [\'distilbert-embedded chatbot classifier\', \'frequency chatbot classifier\', \'emotion classifier\',\n               \'distinct\', \'repetitiveness\', \'t5 grammar correction edit distance\', \'flesch-kincaid index\']:\n    metric_pretty = BBMetric.load_metric(metric).pretty_name\n    metric_params = dict()\n    if metric == "distilbert-embedded chatbot classifier":\n        metric_params = {\'with_barney\': True}\n    elif metric == "frequency chatbot classifier":\n        metric_params = {\'mode\': \'c-tf-idf\'}\n    results = evaluate_round([\n        {\n            \'metric_name\': metric,\n            \'metric_actors\': {\n                \'document\': (MetricActor.DATASET_CHAR, char + \'_df\')\n            },\n            \'reference_set\': char + \'_df\',\n            \'metric_params\': metric_params.copy(),\n            \'metric_attempt\': 0\n        } for char in characters + ["Common"]\n    ] + [\n        {\n            \'metric_name\': metric,\n           

In [16]:
# Disabled cell: the code below is wrapped in a string literal, so nothing in it
# runs; the cell merely evaluates to that string. Remove the surrounding triple
# quotes to execute it.
# When enabled, for each character it queues three 'neural chatbot classifier'
# queries (each passing 'classifier_char') followed by a
# {'run': flush_cache_entries, ...} entry naming that character's
# ['trained_metric', 'neural chatbot classifier', char] cache entry
# (presumably to release the per-character classifier - confirm in evaluate_round).
# NOTE(review): metric_params is assigned inside the string but never used there.
'''
metric = 'neural chatbot classifier'
metric_pretty = BBMetric.load_metric(metric).pretty_name
metric_params = dict()
results = evaluate_round(flatten([[
        {
            'metric_name': metric,
            'metric_actors': {
                'document': (MetricActor.DATASET_CHAR, char + '_df')
            },
            'reference_set': char + '_df',
            'metric_params': {'classifier_char': char},
            'metric_attempt': 0
        },
        {
            'metric_name': metric,
            'metric_actors': {
                'document': (MetricActor.DIALOGPT_SAMPLE, char)
            },
            'reference_set': char + '_df',
            'metric_params': {'classifier_char': char},
            'metric_attempt': 0
        },
        {
            'metric_name': metric,
            'metric_actors': {
                'document': (MetricActor.DIALOGPT_SAMPLE, char)
            },
            'reference_set': 'Common_df',
            'metric_params': {'classifier_char': char},
            'metric_attempt': 0
        },
        {
            'run': flush_cache_entries,
            'run_args': {
                'entries': [['trained_metric', 'neural chatbot classifier', char]]
            }
        }
] for char in characters
]))
metric_dict = load_metric_by_name(out_folder, metric_pretty)
metric_dict = {**metric_dict, **results}
save_metric_by_name(out_folder, metric_pretty, metric_dict)
'''

"\nmetric = 'neural chatbot classifier'\nmetric_pretty = BBMetric.load_metric(metric).pretty_name\nmetric_params = dict()\nresults = evaluate_round(flatten([[\n        {\n            'metric_name': metric,\n            'metric_actors': {\n                'document': (MetricActor.DATASET_CHAR, char + '_df')\n            },\n            'reference_set': char + '_df',\n            'metric_params': {'classifier_char': char},\n            'metric_attempt': 0\n        },\n        {\n            'metric_name': metric,\n            'metric_actors': {\n                'document': (MetricActor.DIALOGPT_SAMPLE, char)\n            },\n            'reference_set': char + '_df',\n            'metric_params': {'classifier_char': char},\n            'metric_attempt': 0\n        },\n        {\n            'metric_name': metric,\n            'metric_actors': {\n                'document': (MetricActor.DIALOGPT_SAMPLE, char)\n            },\n            'reference_set': 'Common_df',\n            'metric_

In [17]:
# Disabled cell: the code below is wrapped in a string literal, so nothing in it
# runs; the cell merely evaluates to that string. Remove the surrounding triple
# quotes to execute it.
# When enabled, it queues 'perplexity' queries for: five cross-character pairs,
# each character on its own set, the "Base" actor on every character's set, and
# every character plus "Base" on 'Common_df'; results are merged into the stored
# metric file.
# NOTE(review): the first query group uses the actor key 'predictor' while the
# other three use 'document' - confirm which key the perplexity metric expects.
'''
metric = 'perplexity'
metric_pretty = BBMetric.load_metric(metric).pretty_name
metric_params = dict()
results = evaluate_round([
    {
        'metric_name': metric,
        'metric_actors': {
            'predictor': (MetricActor.DIALOGPT_SAMPLE, charpair[0])
        },
        'reference_set': charpair[1] + '_df',
        'metric_params': {},
        'metric_attempt': 0
    } for charpair in [('Joey', 'Phoebe'), ('Joey', 'Sheldon'), ('Bender', 'Fry'), ('Bender', 'Barney'),
                       ('Barney', 'Harry')]
] + [
    {
        'metric_name': metric,
        'metric_actors': {
            'document': (MetricActor.DIALOGPT_SAMPLE, char)
        },
        'reference_set': char + '_df',
        'metric_params': metric_params.copy(),
        'metric_attempt': 0
    } for char in characters
] + [
    {
        'metric_name': metric,
        'metric_actors': {
            'document': (MetricActor.DIALOGPT_SAMPLE, "Base")
        },
        'reference_set': char + '_df',
        'metric_params': metric_params.copy(),
        'metric_attempt': 0
    } for char in characters
] + [
    {
        'metric_name': metric,
        'metric_actors': {
            'document': (MetricActor.DIALOGPT_SAMPLE, char)
        },
        'reference_set': 'Common_df',
        'metric_params': metric_params.copy(),
        'metric_attempt': 0
    } for char in characters + ["Base"]
])
metric_dict = load_metric_by_name(out_folder, metric_pretty)
metric_dict = {**metric_dict, **results}
save_metric_by_name(out_folder, metric_pretty, metric_dict)
'''

'\nmetric = \'perplexity\'\nmetric_pretty = BBMetric.load_metric(metric).pretty_name\nmetric_params = dict()\nresults = evaluate_round([\n    {\n        \'metric_name\': metric,\n        \'metric_actors\': {\n            \'predictor\': (MetricActor.DIALOGPT_SAMPLE, charpair[0])\n        },\n        \'reference_set\': charpair[1] + \'_df\',\n        \'metric_params\': {},\n        \'metric_attempt\': 0\n    } for charpair in [(\'Joey\', \'Phoebe\'), (\'Joey\', \'Sheldon\'), (\'Bender\', \'Fry\'), (\'Bender\', \'Barney\'),\n                       (\'Barney\', \'Harry\')]\n] + [\n    {\n        \'metric_name\': metric,\n        \'metric_actors\': {\n            \'document\': (MetricActor.DIALOGPT_SAMPLE, char)\n        },\n        \'reference_set\': char + \'_df\',\n        \'metric_params\': metric_params.copy(),\n        \'metric_attempt\': 0\n    } for char in characters\n] + [\n    {\n        \'metric_name\': metric,\n        \'metric_actors\': {\n            \'document\': (MetricActo

# COMET

In [18]:
# Disabled cell: the code below is wrapped in a string literal, so nothing in it
# runs; the cell merely evaluates to that string. Remove the surrounding triple
# quotes to execute it.
# When enabled, it queues one "comet" query per character with three actors
# (document = DATASET_CHARCONTEXT, reference = DATASET_CHAR, predictor =
# DIALOGPT_SAMPLE) on that character's own set and merges the results into the
# stored metric file.
# NOTE(review): metric_params is assigned inside the string but never used there.
'''
metric = "comet"
metric_pretty = BBMetric.load_metric(metric).pretty_name
metric_params = dict()
results = evaluate_round([
    {
        'metric_name': metric,
        'metric_actors': {
            'document': (MetricActor.DATASET_CHARCONTEXT, char + '_df'),
            'reference': (MetricActor.DATASET_CHAR, char + "_df"),
            'predictor': (MetricActor.DIALOGPT_SAMPLE, char)
        },
        'reference_set': char + '_df',
        'metric_params': {},
        'metric_attempt': 0
    } for char in characters
])
metric_dict = load_metric_by_name(out_folder, metric_pretty)
metric_dict = {**metric_dict, **results}
save_metric_by_name(out_folder, metric_pretty, metric_dict)
'''

'\nmetric = "comet"\nmetric_pretty = BBMetric.load_metric(metric).pretty_name\nmetric_params = dict()\nresults = evaluate_round([\n    {\n        \'metric_name\': metric,\n        \'metric_actors\': {\n            \'document\': (MetricActor.DATASET_CHARCONTEXT, char + \'_df\'),\n            \'reference\': (MetricActor.DATASET_CHAR, char + "_df"),\n            \'predictor\': (MetricActor.DIALOGPT_SAMPLE, char)\n        },\n        \'reference_set\': char + \'_df\',\n        \'metric_params\': {},\n        \'metric_attempt\': 0\n    } for char in characters\n])\nmetric_dict = load_metric_by_name(out_folder, metric_pretty)\nmetric_dict = {**metric_dict, **results}\nsave_metric_by_name(out_folder, metric_pretty, metric_dict)\n'

# Pairwise Metrics

In [19]:
# Disabled cell: the code below is wrapped in a string literal, so nothing in it
# runs; the cell merely evaluates to that string. Remove the surrounding triple
# quotes to execute it.
# When enabled, the first loop evaluates metrics that take 'reference'/'predictor'
# actors and the second loop evaluates metrics that take 'document0'/'document1'
# actors; both compare each character's dataset lines with its DIALOGPT_SAMPLE
# output and compare DIALOGPT_SAMPLE outputs across character pairs on 'Common_df'.
# NOTE(review): 'bertscore' and 'bleurt' each appear twice in the first metric
# list, and the first loop's pair list lacks ('Barney', 'Harry'), which the second
# loop includes - confirm whether either is intentional.
'''
for metric in ['bartscore', 'bleurt', 'bertscore', 'roberta crossencoding similarity',
               'rouge l', 'google bleu', 'meteor', 'bertscore', 'bleurt', 'term error rate']:
    metric_pretty = BBMetric.load_metric(metric).pretty_name
    results = evaluate_round([
        {
            'metric_name': metric,
            'metric_actors': {
                'reference': (MetricActor.DATASET_CHAR, char + "_df"),
                'predictor': (MetricActor.DIALOGPT_SAMPLE, char)
            },
            'reference_set': char + '_df',
            'metric_params': {},
            'metric_attempt': 0
        } for char in characters
    ] + [
        {
            'metric_name': metric,
            'metric_actors': {
                'reference': (MetricActor.DIALOGPT_SAMPLE, charpair[1]),
                'predictor': (MetricActor.DIALOGPT_SAMPLE, charpair[0])
            },
            'reference_set': 'Common_df',
            'metric_params': {},
            'metric_attempt': 0
        } for charpair in [('Joey', 'Phoebe'), ('Joey', 'Sheldon'), ('Bender', 'Fry'), ('Bender', 'Barney')]        
    ])
    metric_dict = load_metric_by_name(out_folder, metric_pretty)
    metric_dict = {**metric_dict, **results}
    save_metric_by_name(out_folder, metric_pretty, metric_dict)

for metric in ['extended edit distance', 'word mover distance', 'mpnet embedding similarity']:
    metric_pretty = BBMetric.load_metric(metric).pretty_name
    results = evaluate_round([
        {
            'metric_name': metric,
            'metric_actors': {
                'document0': (MetricActor.DATASET_CHAR, char + "_df"),
                'document1': (MetricActor.DIALOGPT_SAMPLE, char)
            },
            'reference_set': char + '_df',
            'metric_params': {},
            'metric_attempt': 0
        } for char in characters
    ] + [
        {
            'metric_name': metric,
            'metric_actors': {
                'document0': (MetricActor.DIALOGPT_SAMPLE, charpair[1]),
                'document1': (MetricActor.DIALOGPT_SAMPLE, charpair[0])
            },
            'reference_set': 'Common_df',
            'metric_params': {},
            'metric_attempt': 0
        } for charpair in [('Joey', 'Phoebe'), ('Joey', 'Sheldon'), ('Bender', 'Fry'), ('Bender', 'Barney'),
                           ('Barney', 'Harry')]        
    ])
    metric_dict = load_metric_by_name(out_folder, metric_pretty)
    metric_dict = {**metric_dict, **results}
    save_metric_by_name(out_folder, metric_pretty, metric_dict)
'''

'\nfor metric in [\'bartscore\', \'bleurt\', \'bertscore\', \'roberta crossencoding similarity\',\n               \'rouge l\', \'google bleu\', \'meteor\', \'bertscore\', \'bleurt\', \'term error rate\']:\n    metric_pretty = BBMetric.load_metric(metric).pretty_name\n    results = evaluate_round([\n        {\n            \'metric_name\': metric,\n            \'metric_actors\': {\n                \'reference\': (MetricActor.DATASET_CHAR, char + "_df"),\n                \'predictor\': (MetricActor.DIALOGPT_SAMPLE, char)\n            },\n            \'reference_set\': char + \'_df\',\n            \'metric_params\': {},\n            \'metric_attempt\': 0\n        } for char in characters\n    ] + [\n        {\n            \'metric_name\': metric,\n            \'metric_actors\': {\n                \'reference\': (MetricActor.DIALOGPT_SAMPLE, charpair[1]),\n                \'predictor\': (MetricActor.DIALOGPT_SAMPLE, charpair[0])\n            },\n            \'reference_set\': \'Common_df\',

## 10-Sentences Ranking

In [20]:
# Build the material for the 10-sentences ranking test: for each character, pick
# one context/response pair from the 'Common' test set and sample ten replies
# from that character's model. Everything is stored in `test_10` under
# '<char>_model', '<char>_context', '<char>_label' and '<char>_responses'.
test_10 = dict()

# Row of the 'Common' test set used as the prompt for each character.
test_10_rows = {'Barney': 0, 'Vader': 20}

for char, row in test_10_rows.items():
    test_10[char + '_model'] = get_cache_model(char)
    common_testset = get_cache_testset('Common', base_folder)
    test_10[char + '_context'] = common_testset['context/0'][row]
    test_10[char + '_label'] = common_testset['response'][row]
    test_10[char + '_responses'] = list()

    # The prompt is identical for all ten samples, so tokenize it once
    # instead of on every iteration.
    tokenized_question = cache.tokenizer.encode(test_10[char + '_context'] + cache.tokenizer.eos_token,
                                                return_tensors='tf')
    prompt_length = len(tokenized_question[0])
    # Leave room for up to 128 tokens after the prompt.
    max_length = 128 + tokenized_question.shape[1]

    for i in tqdm(range(10)):
        generated_answer = test_10[char + '_model'].generate(
                            tokenized_question,
                            pad_token_id=cache.tokenizer.eos_token_id,
                            max_length=max_length,
                            do_sample=True,
                            top_k=BBData.top_k,
                            top_p=BBData.top_p)[0].numpy().tolist()
        # Drop the echoed prompt tokens; keep only the tokens that follow them.
        generated_answer = generated_answer[prompt_length:]
        test_10[char + '_responses'].append(cache.tokenizer.decode(generated_answer, skip_special_tokens=True))

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\Data\Characters\Barney\barney_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.
Using custom data configuration default-9ad41ba7d956ebd3
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/cache/csv/default-9ad41ba7d956ebd3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a6

  0%|          | 0/1 [00:00<?, ?it/s]

Loaded cache at ['testset', 'Common_df']


  0%|                                                                                           | 0/10 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
 10%|████████▎                                                                          | 1/10 [00:03<00:27,  3.11s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
 20%|████████████████▌                                                                  | 2/10 [00:04<00:18,  2.32s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
 30%|████████████████████████▉                                                          | 3/10 [00:07<00:16,  2.39s/it]A dec

In [21]:
# Print, per character, the chosen context followed by the sampled answers.
for char in ['Barney', 'Vader']:
    report_lines = [
        char.upper(),
        "\t-Context-",
        "\t\t* " + test_10[char + '_context'],
        "\t-Answers-",
    ]
    report_lines.extend('\t\t* ' + answer for answer in test_10[char + '_responses'])
    print("\n".join(report_lines))
    print()

BARNEY
	-Context-
		* Barney, this is about the building.
	-Answers-
		* No way!
		* No, I said no.
		* I know it's a clich, but...
		* What building?
		* The building?
		* Okay, I guess I'll be taking it easy.
		* The building?
		* Ted. You okay?
		* Ted, your plan was flawless. Ted, I'm done with that. You've been a rock for six years. Six months, you're out of warranty. I call the fire department.
		* Ted, these people are ridiculous.

VADER
	-Context-
		* I will not fight you.
	-Answers-
		* There is no doubt in my mind. I'm the Master. I will not fight you!
		* This is a trap!
		* You are too weak.
		* You are not a true Jedi until you have beaten The Senate.
		* Your men will not stand for this!
		* You must not think about me.
		* You have just accepted your fate.
		* Lord Vader, when you meet me again, I will not turn myself to your men.
		* My father.
		* Don't fight me.



In [None]:
# 10-sentences ranking: score every sampled response of each character with every
# metric in BBMetric.metrics_list (perplexity is skipped) and save all results in
# one file. Each (character, metric) result is stored under its query hash, the
# same keying evaluate_round uses for its results.
metric_filename = "10 Sentences Ranking.json"

metric_dict = {
    'test_additional_data' : {
        'generated_sentences': {
            'Barney': test_10['Barney_responses'],
            'Vader': test_10['Vader_responses']
        }
    }
}

for char in ['Barney', 'Vader']:
    for metric_name in tqdm(BBMetric.metrics_list):
        print("Computing " + metric_name + " for character " + char)
        query_output = dict()
        query_output['metric_name'] = metric_name
        query_output['metric_version'] = 1
        query_output['metric_attempt'] = 0
        query_output['context'] = {
            "dialogpt_size": "small",
            "dialogpt_context_sentences": BBData.context_n,
            "dialogpt_nbeams_beams": BBData.n_beams,
            "dialogpt_sample_top_p": BBData.top_p,
            "dialogpt_sample_top_k": BBData.top_k
        }
        query_output['metric_arity'] = get_metric_arity(metric_name)
        query_output['metric_determinism'] = get_metric_determinism(metric_name, 1)
        query_output['reference_set'] = [test_10[char + '_context']]
        if metric_name in ['bartscore', 'rouge l', 'google bleu', 'meteor', 'bertscore', 'bleurt', 'term error rate',
                           'roberta crossencoding similarity']:
            # Metrics comparing a prediction against a reference sentence.
            metric = get_cache_metric(metric_name)
            metric_params = dict()
            query_output['metric_actors'] = {
                'predictor': (MetricActor.DIALOGPT_SAMPLE, char),
                'reference': [test_10[char + '_label']]
            }
            compute_args = [{
                'predictions': sentence,
                'references': test_10[char + '_label']
            } for sentence in test_10[char + '_responses']]
        elif metric_name in ['extended edit distance', 'word mover distance', 'mpnet embedding similarity']:
            # Metrics comparing two sets of sentences.
            metric = get_cache_metric(metric_name)
            metric_params = dict()
            query_output['metric_actors'] = {
                'document0': (MetricActor.DIALOGPT_SAMPLE, char),
                'document1': [test_10[char + '_label']]
            }
            compute_args = [{
                'sentences_a': sentence,
                'sentences_b': test_10[char + '_label']
            } for sentence in test_10[char + '_responses']]
        elif metric_name in ['distilbert-embedded chatbot classifier', 'frequency chatbot classifier', 'emotion classifier',
                             'distinct', 'repetitiveness', 't5 grammar correction edit distance', 'flesch-kincaid index']:
            # Metrics computed on a single sentence. Two of them need extra
            # parameters; those must not be overwritten by the default case.
            if metric_name == 'frequency chatbot classifier':
                metric_params = {'mode': 'c-tf-idf'}
            elif metric_name == 'distilbert-embedded chatbot classifier':
                metric_params = {'with_barney': True}
            else:
                metric_params = dict()
            metric = get_cache_metric(metric_name, **metric_params)
            query_output['metric_actors'] = {
                'document': (MetricActor.DIALOGPT_SAMPLE, char)
            }
            compute_args = [{
                'sentences': sentence
            } for sentence in test_10[char + '_responses']]
        elif metric_name == 'perplexity':
            print("Skipping Perplexity.")
            continue
        elif metric_name == 'neural chatbot classifier':
            metric = get_cache_metric(metric_name, classifier_char=char)
            metric_params = {'classifier_char': char}
            query_output['metric_actors'] = {
                'document': (MetricActor.DIALOGPT_SAMPLE, char)
            }
            responses_n = len(test_10[char + '_responses'])
            # Each call receives a window of three consecutive responses,
            # wrapping around at the end of the list.
            compute_args = [{
                'sentences': [test_10[char + '_responses'][i],
                              test_10[char + '_responses'][(i+1) % responses_n],
                              test_10[char + '_responses'][(i+2) % responses_n]],
                'load_path': os.path.join(base_folder, "Data", "Characters", char, character_dict[char]['classifier_folder']),
                'character': char
            } for i in range(responses_n)]
        elif metric_name == 'comet':
            metric = get_cache_metric(metric_name)
            metric_params = dict()
            query_output['metric_actors'] = {
                'document': [test_10[char + '_context']],
                'predictor': (MetricActor.DIALOGPT_SAMPLE, char),
                'reference': [test_10[char + '_label']]
            }
            compute_args = [{
                'sources': test_10[char + '_context'],
                'predictions': sentence,
                'references': test_10[char + '_label']
            } for sentence in test_10[char + '_responses']]
        else:
            # Without this branch an unrecognised metric would silently reuse the
            # metric object and arguments left over from the previous iteration.
            print("Skipping " + metric_name + ": no handling defined for this metric.")
            continue
        query_output['metric_params'] = metric_params
        query_hash = dict_hash({'metric_name': query_output['metric_name'],
                        'metric_version': query_output['metric_version'],
                        'reference_set': query_output['reference_set'],
                        'metric_attempt': query_output['metric_attempt'],
                        'metric_actors': query_output['metric_actors'],
                        'context': query_output['context'],
                        'metric_params': query_output['metric_params']})
        # NOTE(review): evaluate_round also merges metric_params into the compute()
        # keyword arguments; this cell passes only compute_args - confirm whether
        # 'mode' / 'with_barney' must be forwarded to compute() as well.
        results = [metric.compute(**args) for args in compute_args]
        query_output['answer'] = results
        # Store the result under its own hash so that results of different
        # metrics and characters do not overwrite each other.
        metric_dict[query_hash] = query_output

save_metric_by_name(os.path.join(out_folder, 'Advanced Tests'), metric_filename, metric_dict)

  0%|                                                                                           | 0/21 [00:00<?, ?it/s]

Computing google bleu for character Barney


  5%|███▉                                                                               | 1/21 [00:01<00:37,  1.88s/it]

Computing mpnet embedding similarity for character Barney


 10%|███████▉                                                                           | 2/21 [00:26<04:47, 15.15s/it]

Computing rouge l for character Barney


 14%|███████████▊                                                                       | 3/21 [00:29<02:54,  9.68s/it]

Computing meteor for character Barney


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tonel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tonel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\tonel\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
 19%|███████████████▊                                                                   | 4/21 [00:30<01:48,  6.36s/it]

Computing emotion classifier for character Barney


 24%|███████████████████▊                                                               | 5/21 [00:32<01:17,  4.83s/it]

Computing roberta crossencoding similarity for character Barney


 33%|███████████████████████████▋                                                       | 7/21 [00:43<01:04,  4.63s/it]

Computing distinct for character Barney
Computing neural chatbot classifier for character Barney


 38%|███████████████████████████████▌                                                   | 8/21 [00:45<00:50,  3.89s/it]

Computing perplexity for character Barney
Skipping Perplexity.
Computing repetitiveness for character Barney


 48%|███████████████████████████████████████                                           | 10/21 [00:46<00:23,  2.09s/it]

Computing term error rate for character Barney


 52%|██████████████████████████████████████████▉                                       | 11/21 [00:47<00:18,  1.89s/it]

Computing bertscore for character Barney


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 57%|██████████████████████████████████████████████▊                                   | 12/21 [00:53<00:27,  3.06s/it]

Computing comet for character Barney


  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU 

 62%|██████████████████████████████████████████████████▊                               | 13/21 [01:04<00:40,  5.04s/it]

Computing bleurt for character Barney




INFO:tensorflow:Reading checkpoint C:\Users\tonel\.cache\huggingface\metrics\bleurt\default\downloads\extracted\b094b72f3dc7e1712a641ab624024c3b182ff714848ee334f1cc7a628d0b7798\bleurt-base-128.


INFO:tensorflow:Reading checkpoint C:\Users\tonel\.cache\huggingface\metrics\bleurt\default\downloads\extracted\b094b72f3dc7e1712a641ab624024c3b182ff714848ee334f1cc7a628d0b7798\bleurt-base-128.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


Computing word mover distance for character Barney
Computing bartscore for character Barney


 81%|██████████████████████████████████████████████████████████████████▍               | 17/21 [01:36<00:25,  6.42s/it]

Computing extended edit distance for character Barney
Computing t5 grammar correction edit distance for character Barney


 86%|██████████████████████████████████████████████████████████████████████▎           | 18/21 [01:58<00:31, 10.57s/it]

Computing distilbert-embedded chatbot classifier for character Barney
