# Setup

In [1]:
# Parameters
# When True, the cache helpers print a message every time a cache entry is loaded or flushed
verbose = True
# Forwarded as `use_cuda` when the distilbert-embedded chatbot classifier metric is loaded
use_cuda = False

# When True, the metric-training cell (re)trains the chatbot classifiers; otherwise training is skipped
do_metric_training = False
# When True, the predictions cell runs the chatbot models and stores their responses to prediction files
do_predictions = False

In [2]:
### Run environment setup
import os
import lib.BBSetup as BBSetup

# Location of the project on the local (Anaconda) machine, used whenever the Colab setup is not available
LOCAL_BASE_FOLDER = r"E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot"

# Setup fallback chain: Colab first, then the manual Anaconda setup, finally the automatic Anaconda setup.
# `except Exception` (instead of a bare `except`) keeps the same best-effort fallbacks, but no longer
# swallows KeyboardInterrupt / SystemExit, so the setup can still be interrupted by the user.
try:
    from google.colab import drive
    BBSetup.colab_setup(mount_folder=r"/content/drive/My Drive/unibo/NLP_project/BarneyBot")
except Exception as colab_error:
    if verbose:
        print("Colab setup not available (" + str(colab_error) + "), trying manual Anaconda setup")
    try:
        BBSetup.anaconda_manual_setup(base_folder=LOCAL_BASE_FOLDER,
                                      env_name="barneybot")
    except Exception as manual_error:
        if verbose:
            print("Manual Anaconda setup failed (" + str(manual_error) + "), trying automatic Anaconda setup")
        BBSetup.anaconda_auto_setup(base_folder=LOCAL_BASE_FOLDER)

### Define folders
base_folder = BBSetup.BASE_FOLDER
# Per-character data (datasets, checkpoints) is read from here
in_folder = BBSetup.set_folder(os.path.join(base_folder, 'Data', 'Characters'))
# Folder registered for the metrics output
out_folder = BBSetup.set_folder(os.path.join(base_folder, 'Metrics', 'New'))

pip install -r "E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\requirements.txt"
pip install -r "E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\requirements.txt"


In [3]:
# Various necessary imports
from lib.BBDataLoad import load_char_df, get_chatbot_predictions, dialogpt_preprocess_function
from datasets import load_dataset
from transformers import TFAutoModelForCausalLM
from lib.BBMetrics import BBMetric
from lib.BBMetricResults import *
from tqdm import tqdm

# Get the list of characters, removing the Default one
from lib.BBData import character_dict, model_name, random_state
import lib.BBData as BBData
# Unpack the dictionary keys into a list, then drop the 'Default' entry (raises ValueError if it is missing)
characters = [*character_dict.keys()]
characters.remove('Default')

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tonel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Import structures from HuggingFace
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling

# Load the DialoGPT tokenizer
# (`model_name` is imported from lib.BBData; downloaded files are stored under <base_folder>/cache)
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          cache_dir=os.path.join(base_folder, "cache"))
# Use '#' as the padding token
# NOTE(review): presumably needed because the pretrained tokenizer defines no pad token -- confirm
tokenizer.pad_token = '#'
# Collator configured for causal language modeling (mlm=False), returning TensorFlow tensors
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer, return_tensors='tf')

Downloading: 100%|██████████████████████████████████████████████████████████████████████████| 26.0/26.0 [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading: 100%|████████████████████████████████████████████████████████████████████████████| 641/641 [00:00<?, ?B/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████| 1.04M/1.04M [00:01<00:00, 582kB/s]
Downloading: 100%|███████████████████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 472kB/s]


In [5]:
# Simple function to recursively flatten a list of lists (useful for evaluation queries)
def flatten(S):
    """Flatten an arbitrarily nested list of lists into a single flat list.

    Only `list` instances are expanded; every other element (strings, tuples, numbers, ...)
    is kept as-is, in its original left-to-right order.

    The traversal is iterative (explicit stack of iterators), so it works on lists longer
    or nested deeper than Python's recursion limit and runs in linear time.

    Args:
        S: the (possibly nested) list to flatten.

    Returns:
        A new flat list with all non-list elements of `S`.
    """
    flat = []
    # Each stack entry is an iterator over a list that is currently being expanded
    pending = [iter(S)]
    while pending:
        for element in pending[-1]:
            if isinstance(element, list):
                # Descend into the nested list; the outer iterator resumes once it is exhausted
                pending.append(iter(element))
                break
            flat.append(element)
        else:
            # Innermost list fully consumed: go back to its parent
            pending.pop()
    return flat

In [6]:
# Run chatbot models to store responses onto a predictions file, so that they can be loaded quickly when computing metrics
if do_predictions:
    print("Saving predictions to file")
    # Generation strategies run for every fine-tuned character chatbot
    generation_types = ['greedy', 'nbeams', 'sampling']
    # One progress step per (character, generation type), plus one per character for the default model
    with tqdm(total=len(characters) * (len(generation_types) + 1)) as pbar:
        # Responses of chatbot of a character on their own dataset
        for char in characters:
            checkpoint_folder = os.path.join(in_folder, char,
                                             character_dict[char]['checkpoint_folder'])
            model = TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint_folder)
            model.compile()
            samples = load_char_df(char)
            for gen_type in generation_types:
                # The notebook-level `tokenizer` is used here: the `cache` object is only created in a later
                # cell, so referencing `cache.tokenizer` would fail on a fresh "Restart & Run All"
                get_chatbot_predictions(samples['test']['context/0'], model,
                              character_dict[char]['prediction_filename'] + '_' + gen_type + '.json',
                              gen_type, char, tokenizer, base_folder, override_predictions=True)
                pbar.update(1)
        # Responses of default dialogpt on each character's dataset
        # The default model does not depend on the character, so it is loaded once outside the loop
        model = TFAutoModelForCausalLM.from_pretrained(model_name,
                                                       cache_dir=os.path.join(base_folder, "cache"))
        model.compile()
        for char in characters:
            samples = load_char_df(char)
            # Generation type is stated explicitly ('sampling', matching the file name) instead of relying
            # on the value leaked by the `gen_type` loop variable above
            get_chatbot_predictions(samples['test']['context/0'], model,
                              'from_' + char + "_df__sampling.json", 'sampling',
                              "Default", tokenizer, base_folder, override_predictions=True)
            pbar.update(1)

In [7]:
# Train metrics (only has to be done once)
if do_metric_training:
    print("Training metrics")
    # A single manually-updated progress bar tracks every step: one per character (neural classifier)
    # plus the two distilbert-embedded classifier trainings
    with tqdm(total=len(characters) + 2) as pbar:
        # Neural Chatbot Classifier training
        # (iterate the plain list: wrapping it in a second tqdm would draw a duplicate progress bar)
        for char in characters:
            neural_classifier = BBMetric.load_metric("neural chatbot classifier")
            neural_classifier.train(character=char, random_state=random_state,
                     source_encoded_path=None,
                     source_path=os.path.join(base_folder, "Data", "Sources",
                                              character_dict[char]['source'],
                                              character_dict[char]['source'] + ".csv"),
                     source_save_path=os.path.join(base_folder, "Data", "Characters", char),
                     save_path=os.path.join(base_folder, "Data", "Characters", char))
            pbar.update(1)
        # Distilbert-Embedded Chatbot Classifier training
        bertembedded_classifier = BBMetric.load_metric("distilbert-embedded chatbot classifier")
        bertembedded_classifier.train(characters_path=os.path.join(base_folder, "Data", "Characters"),
                                      save_path=os.path.join(base_folder, "Data", "Metrics",
                                                             "distilbert_embedder"),
                                      train_embedder=True,
                                      verbose=True)
        pbar.update(1)
        # Also train the classifier without Barney
        characters_no_barney = characters.copy()
        characters_no_barney.remove("Barney")
        bertembedded_classifier = BBMetric.load_metric("distilbert-embedded chatbot classifier")
        # Restrict the character set before training, so that the saved embedder excludes Barney
        bertembedded_classifier.metric.set_characters(characters_no_barney)
        bertembedded_classifier.train(characters_path=os.path.join(base_folder, "Data", "Characters"),
                                      save_path=os.path.join(base_folder, "Data", "Metrics",
                                                             "distilbert_embedder_nobarney"),
                                      train_embedder=True,
                                      verbose=True)
        pbar.update(1)
else:
    print("Skipping Metrics training.")

Skipping Metrics training.


# Cache System Creation

In [8]:
# Define a cache structure to avoid reloading models and predictions
from types import SimpleNamespace

# Cache layout: every slot starts as None and is filled lazily by the get_cache_* helpers.
# The namespace is built directly, one attribute per cache section.
cache = SimpleNamespace(
    # DialoGPT models, one per character chatbot plus the base model
    dialogpt=dict.fromkeys(characters + ["Base"]),
    tokenizer=None,
    datacollator=None,
    # Metrics that need loading/training before use, indexed by their variant
    trained_metric={
        'neural chatbot classifier': dict.fromkeys(characters),
        'frequency chatbot classifier': dict.fromkeys(('c-tf-idf', 'tf-idf', 'word frequency')),
        'distilbert-embedded chatbot classifier': dict.fromkeys(('Full', 'No Barney')),
    },
    # Test sets (raw, and concatenated/encoded), one per character dataset plus the common one
    testset=dict.fromkeys(name + "_df" for name in characters + ["Common"]),
    concat_and_encoded_testset=dict.fromkeys(name + "_df" for name in characters + ["Common"]),
    # Predictions indexed by dataset -> chatbot -> generation type
    predictions={
        dataset_name + "_df": {
            chatbot_name: dict.fromkeys(('greedy', 'nbeams', 'sampling'))
            for chatbot_name in characters + ["Base"]
        }
        for dataset_name in characters + ["Common"]
    },
)

# Simple function to load an entry into the cache, if it is not present, otherwise just return it
def load_cache_entry(value, entry):
    """Store `value` in the cache slot addressed by `entry` if the slot is empty, then return the slot content.

    Args:
        value: object to store when the addressed slot currently holds None.
        entry: list of keys describing the path from the cache root to the slot,
            e.g. ['predictions', 'Barney_df', 'Barney', 'greedy'] or ['tokenizer'].

    Returns:
        The content of the slot after the call: the previously cached object if there was one,
        `value` otherwise.

    Raises:
        TypeError: if the path crosses an object that is neither a dict nor a SimpleNamespace.
        KeyError: if a key along the path does not exist.
    """
    def _children(node):
        # Expose both dictionaries and namespaces as a plain mapping of their children
        if isinstance(node, dict):
            return node
        if isinstance(node, SimpleNamespace):
            return vars(node)
        raise TypeError("Cannot traverse cache entry " + str(entry) + ": unexpected node of type " +
                        str(type(node)))

    node = cache
    for key in entry[:-1]:
        node = _children(node)[key]
    # The last container is also resolved through _children, so that top-level entries
    # (whose container is the namespace itself) are supported
    slots = _children(node)
    # Only a None slot counts as empty: a falsy cached value (e.g. an empty list) is a valid cache hit
    if slots[entry[-1]] is None:
        slots[entry[-1]] = value
        if verbose:
            print("Loaded cache at " + str(entry))
    return slots[entry[-1]]

# Simple function to remove a cache entry reference, to avoid memory overloading
def flush_cache_entries(entries):
    """Reset the given cache slots to None, dropping the cache's reference to the stored objects.

    Args:
        entries: iterable of key paths; each path is a list of keys from the cache root to the slot,
            e.g. [['dialogpt', 'Barney'], ['testset', 'Barney_df']].

    Raises:
        TypeError: if a path crosses an object that is neither a dict nor a SimpleNamespace.
        KeyError: if an intermediate key along a path does not exist.
    """
    def _children(node, entry):
        # Expose both dictionaries and namespaces as a plain mapping of their children
        if isinstance(node, dict):
            return node
        if isinstance(node, SimpleNamespace):
            return vars(node)
        raise TypeError("Cannot traverse cache entry " + str(entry) + ": unexpected node of type " +
                        str(type(node)))

    for entry in entries:
        node = cache
        for key in entry[:-1]:
            node = _children(node, entry)[key]
        # Resolving the last container through _children makes top-level entries (e.g. ['tokenizer'])
        # work too, since their container is the namespace itself
        _children(node, entry)[entry[-1]] = None
        if verbose:
            print("Flushed cache at " + str(entry))

In [9]:
# Function to get the testset (context, label) for a given character from cache
def get_cache_testset(character, base_folder):
    """Return the test set of `character` (or the common dataset), loading it into the cache on first use.

    Args:
        character: name of a character, or "Common" for the common dataset.
        base_folder: root folder of the project, used to locate the data and the datasets cache.

    Returns:
        The dataset stored in cache.testset under the key `character + "_df"`.
    """
    slot = character + "_df"
    # Cache hit: nothing to load
    if cache.testset[slot]:
        return cache.testset[slot]
    if character == "Common":
        # The common dataset is a single csv file, exposed by `load_dataset` as the 'train' split
        common_csv = os.path.join(base_folder, 'Data', 'Sources', 'common_dataset.csv')
        loaded = load_dataset('csv',
                              data_files=common_csv,
                              cache_dir=os.path.join(base_folder, "cache"))['train']
    else:
        # Character datasets: only the test split is needed
        loaded = load_char_df(character, base_folder)['test']
    load_cache_entry(loaded, ['testset', slot])
    return cache.testset[slot]

# Function to get the encoded and concatenated contexts/labels, used for perplexity
def get_cache_concat_and_encoded_testset(character, base_folder):
    """Return the encoded test set of `character` as a batched TensorFlow dataset, caching it on first use.

    Args:
        character: name of a character, or "Common" for the common dataset.
        base_folder: root folder of the project, forwarded to `get_cache_testset`.

    Returns:
        The dataset stored in cache.concat_and_encoded_testset under the key `character + "_df"`.
    """
    slot = character + "_df"
    # Cache hit: nothing to prepare
    if cache.concat_and_encoded_testset[slot]:
        return cache.concat_and_encoded_testset[slot]
    # Encode each row separately (batched=False) with the cached tokenizer
    raw_testset = get_cache_testset(character, base_folder)
    encoded_rows = raw_testset.map(
        lambda row: dialogpt_preprocess_function(row, cache.tokenizer),
        batched=False)
    # Batch the encoded rows through the cached data collator, preserving the row order
    tf_testset = encoded_rows.to_tf_dataset(
        columns=["input_ids", "attention_mask", "labels"],
        shuffle=False,
        batch_size=8,
        collate_fn=cache.datacollator,
    )
    load_cache_entry(tf_testset, ['concat_and_encoded_testset', slot])
    return cache.concat_and_encoded_testset[slot]

# Function to get the responses of a chatbot to contexts, given a character and the context source (dataset_from)
def get_cache_predictions(dataset_from, character, base_folder, gen_type):
    """Return the decoded responses of a chatbot on a dataset, loading them into the cache on first use.

    Supported combinations:
      * `dataset_from == "Common_df"`: responses are generated on the fly with the model of `character`;
      * `character == "Base"` on a character dataset: responses are read from the
        'from_<dataset_from>__<gen_type>.json' prediction file of the default model;
      * a character on its own dataset (`dataset_from == character + "_df"`): responses are read from
        the character's prediction file.

    Args:
        dataset_from: name of the dataset providing the contexts, with the "_df" suffix.
        character: name of the chatbot (a character, or "Base" for the non fine-tuned model).
        base_folder: root folder of the project.
        gen_type: generation type ('greedy', 'nbeams' or 'sampling').

    Returns:
        The list of decoded responses stored in cache.predictions[dataset_from][character][gen_type].

    Raises:
        NotImplementedError: if the (dataset, chatbot) combination is not one of those listed above.
    """
    if not cache.predictions[dataset_from][character][gen_type]:
        # Common dataset: no prediction file exists, so the responses are generated with the cached model
        if dataset_from == "Common_df":
            df = load_dataset('csv',
                         data_files=os.path.join(base_folder, 'Data', 'Sources', 'common_dataset.csv'),
                         cache_dir=os.path.join(base_folder, "cache"))
            df = df.remove_columns(['source'])
            model = get_cache_model(character)
            predictions_tk = get_chatbot_predictions(df['train']['context/0'], model,
                  "", gen_type, character, cache.tokenizer, base_folder, file_caching=False, override_predictions=False)
        # Base dialogpt on a character dataset. This case must be tested before the "own dataset" one:
        # "Base_df" is not a dataset, so comparing dataset_from with character + "_df" can never match it
        elif character == "Base":
            predictions_tk = get_chatbot_predictions(None, None,
                  'from_' + dataset_from + '__' + gen_type + '.json',
                  None, 'Default', None, base_folder, override_predictions=False)
        # Character chatbot on its own dataset
        elif dataset_from == character + "_df":
            predictions_tk = get_chatbot_predictions(None, None,
                  character_dict[character]['prediction_filename'] + '_' + gen_type + '.json',
                  None, character, None, base_folder, override_predictions=False)
        else: # Other cases are not supported
            raise NotImplementedError("Unexpected predictions to load!")
        # Given the predictions, decode them and load them into cache
        predictions = [cache.tokenizer.decode(line, skip_special_tokens=True) for line in predictions_tk]
        load_cache_entry(predictions, ['predictions', dataset_from, character, gen_type])
    return cache.predictions[dataset_from][character][gen_type]

# Function to cache metrics. Most are just loaded normally through BBMetrics.load_metric, while others are also
# prepared for use (e.g. classifiers)
def get_cache_metric(metric_name, **kwargs):
    """Return the metric called `metric_name`, preparing and caching it when it is one of the cached ones.

    Keyword Args:
        classifier_char: character of the neural chatbot classifier to load.
        mode: training mode of the frequency chatbot classifier.
        with_barney: truthy to use the distilbert-embedded classifier including Barney ('Full' variant),
            falsy or missing for the 'No Barney' variant.

    Returns:
        The metric object: taken from (or stored into) cache.trained_metric for the cached metrics,
        freshly loaded through BBMetric.load_metric for every other metric.
    """
    classifier_char = kwargs.get('classifier_char')
    mode = kwargs.get('mode')
    barney_variant = 'Full' if kwargs.get('with_barney') else 'No Barney'

    # Metrics that do not require caching are simply loaded from BBMetrics
    if metric_name not in cache.trained_metric:
        return BBMetric.load_metric(metric_name)

    slots = cache.trained_metric[metric_name]

    if metric_name == "neural chatbot classifier":
        if not slots[classifier_char]:
            slots[classifier_char] = BBMetric.load_metric(metric_name)
            # Dummy computation, so that the metric caches all of its internal loadables
            classifier_path = os.path.join(base_folder, "Data", "Characters",
                                           classifier_char,
                                           character_dict[classifier_char]['classifier_folder'])
            slots[classifier_char].compute(character=classifier_char,
                                           load_path=classifier_path,
                                           sentences=["Hi", "Hello", "How"])
        return slots[classifier_char]

    if metric_name == "frequency chatbot classifier":
        if not slots[mode]:
            slots[mode] = BBMetric.load_metric(metric_name)
            # Train on the requested mode (usually c-tf-idf)
            slots[mode].train(characters_path=os.path.join(base_folder, "Data", "Characters"),
                              mode=mode)
        return slots[mode]

    if metric_name == "distilbert-embedded chatbot classifier":
        if not slots[barney_variant]:
            # The two variants only differ by the folder of the pretrained embedder
            if barney_variant == 'Full':
                embedder_folder = "distilbert_embedder"
            else:
                embedder_folder = "distilbert_embedder_nobarney"
            slots[barney_variant] = BBMetric.load_metric(
                metric_name,
                embedder_path=os.path.join(base_folder, "Data", "Metrics", embedder_folder),
                from_pretrained=True, use_cuda=use_cuda)
            # The embedder is loaded from file (train_embedder=False), the rest is trained here
            slots[barney_variant].train(
                characters_path=os.path.join(base_folder, "Data", "Characters"),
                save_path=None, train_embedder=False
            )
        return slots[barney_variant]

# Function to load a DialoGPT model into cache and retrieve it
def get_cache_model(character):
    """Return the DialoGPT model of `character`, loading it into the cache only when it is not there yet.

    Args:
        character: "Base" for the non fine-tuned model, otherwise the name of a character whose
            fine-tuned checkpoint is loaded.

    Returns:
        The compiled model stored in cache.dialogpt[character].
    """
    # Reuse the cached model when available: reloading it from disk at every call would defeat
    # the purpose of the cache (a flushed entry is None, and is therefore reloaded)
    if cache.dialogpt[character] is None:
        # Load either the base model or one of the characters fine-tuned models
        if character == "Base":
            model = TFAutoModelForCausalLM.from_pretrained(model_name,
                                                           cache_dir=os.path.join(base_folder, "cache"))
        else:
            checkpoint_folder = os.path.join(in_folder, character,
                                             character_dict[character]['checkpoint_folder'])
            model = TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint_folder)
        # Compile the model and store it in cache
        model.compile()
        cache.dialogpt[character] = model
    return cache.dialogpt[character]

# The tokenizer and the data collator are needed throughout data preparation: register both in the cache
cache.tokenizer, cache.datacollator = tokenizer, data_collator

# Evaluation Process Definition

In [10]:
# Function to extract the correct list of strings to pass onto a metric, given the context show, the chatbot and the required
# column (e.g. context, label or predictions)
def sentence_callable(reference_set, character, column):
    """Fetch the list of sentences that a metric must receive for a given actor.

    Args:
        reference_set: name of the dataset providing the contexts, with the "_df" suffix.
        character: name of the character (for dataset columns) or of the chatbot (for predictions).
        column: 'context/0' or 'response' to read the test set, otherwise the generation type of the
            predictions to read.

    Returns:
        The requested test set column, or the cached predictions.

    Raises:
        AssertionError: if the (reference_set, character, column) combination is not supported.
    """
    if column in ("context/0", "response"):
        # Contexts and labels are only available on the dataset of the character itself
        assert reference_set == character + "_df"
        return get_cache_testset(character, base_folder)[column]
    # Predictions: own dataset, common dataset, or sampled responses of the base model
    assert (reference_set == character + "_df"
            or reference_set == "Common_df"
            or (character == "Base" and column == "sampling"))
    return get_cache_predictions(reference_set, character, base_folder, column)

# Function to extract the correct data necessary by perplexity (model and specially encoded testset)
def perplexity_callable(reference_set, character):
    """Build the keyword arguments required by the perplexity metric.

    Args:
        reference_set: name of the dataset to evaluate on, with the "_df" suffix.
        character: name of the chatbot whose model is evaluated.

    Returns:
        A dict with the model ('model') and the encoded test set ('encoded_test_set').
    """
    # The model is fetched first, then the encoded test set
    model = get_cache_model(character)
    # The test set helpers expect the dataset name without "_df"
    dataset_name = reference_set.replace("_df", "")
    encoded_test_set = get_cache_concat_and_encoded_testset(dataset_name, base_folder)
    return {'model': model, 'encoded_test_set': encoded_test_set}

In [11]:
# Define the evaluation procedure, taking in input a list of dictionaries, each acting as a query
def evaluate_round(queries):
    """Run a list of queries and collect the results of the metric-computing ones.

    Each query is a dictionary of one of two kinds:
      * operation query: has a 'run' callable and its 'run_args' keyword arguments
        (generally used to flush the cache);
      * metric query: has 'metric_name', 'metric_actors', 'reference_set' and 'metric_params',
        plus the optional 'metric_version' (default 1) and 'metric_attempt' (default 0).

    A failing query does not interrupt the round: the failure is printed and the next query is run.
    The dictionaries in `queries` are not modified.

    Args:
        queries: list of query dictionaries.

    Returns:
        A dictionary mapping the hash of each successful metric query to its output (metadata
        of the query plus the metric result under 'answer').
    """
    # Internal maps from enum to string, for proper interpretation of the query into function args
    actors_pprint_map = {
        MetricActor.DATASET_CHAR: "dataset",
        MetricActor.DATASET_CHARCONTEXT: "dataset labels",
        MetricActor.DIALOGPT_GREEDY: "dialogpt (greedy)",
        MetricActor.DIALOGPT_NBEAMS: "dialogpt (nbeamns)",
        MetricActor.DIALOGPT_SAMPLE: "dialogpt (sampling)"
    }
    actor_to_column_map = {
        MetricActor.DATASET_CHARCONTEXT: 'context/0',
        MetricActor.DATASET_CHAR: 'response',
        MetricActor.DIALOGPT_GREEDY: 'greedy',
        MetricActor.DIALOGPT_NBEAMS: 'nbeams',
        MetricActor.DIALOGPT_SAMPLE: 'sampling'
    }
    # Store query results here
    results = dict()
    # Iterate over queries
    for i in range(len(queries)):
        try:
            query = queries[i].copy() # Since there are destructive operations
            print("#### Running Query " + str(i+1) + "/" + str(len(queries)) + " ####")
            # Query requires to run an operation (flush the cache, generally)
            if 'run' in query:
                query['run'](**query['run_args'])
            # Otherwise, it is a metric-computing query
            else:
                # Print some info to user
                print("Evaluating " + query['metric_name'] + \
                      " on reference set " + query['reference_set'] + " with:")
                for actor_type, actor in query['metric_actors'].items():
                    print("\t" + actor[1] + " " + actors_pprint_map[actor[0]] + " as " + actor_type)
                # The copy above is shallow: the nested dictionaries modified below (actors renaming,
                # removal of custom params) must be copied too, otherwise the caller's queries would be
                # altered and running the same queries a second time would fail
                query['metric_actors'] = dict(query['metric_actors'])
                query['metric_params'] = dict(query['metric_params'])
                # Get metric metadata data for outputting
                # NOTE(review): the output shares the two working copies above, so (as before) the stored
                # actors/params reflect the modifications applied below -- confirm this is what the
                # consumers of the results expect
                query_output = dict()
                query_output['metric_name'] = query['metric_name']
                query_output['metric_version'] = 1 if 'metric_version' not in query else query['metric_version']
                query_output['metric_attempt'] = 0 if 'metric_attempt' not in query else query['metric_attempt']
                query_output['metric_actors'] = query['metric_actors']
                query_output['metric_params'] = query['metric_params']
                query_output['context'] = {
                    "dialogpt_size": "small",
                    "dialogpt_context_sentences": BBData.context_n,
                    "dialogpt_nbeams_beams": BBData.n_beams,
                    "dialogpt_sample_top_p": BBData.top_p,
                    "dialogpt_sample_top_k": BBData.top_k
                }
                query_output['metric_arity'] = get_metric_arity(query['metric_name'])
                query_output['metric_determinism'] = get_metric_determinism(query['metric_name'],
                                                                            query_output['metric_version'])
                query_output['reference_set'] = query['reference_set']
                # The hash is computed before the modifications below, i.e. on the query as written by the user
                query_hash = dict_hash({'metric_name': query_output['metric_name'],
                                        'metric_version': query_output['metric_version'],
                                        'reference_set': query_output['reference_set'],
                                        'metric_attempt': query_output['metric_attempt'],
                                        'metric_actors': query_output['metric_actors'],
                                        'context': query_output['context'],
                                        'metric_params': query_output['metric_params']})
                # This is a lazy fix to remove the "_df" suffix used in query, but not used in some functions
                for key in query['metric_actors'].keys():
                    if query['metric_actors'][key][0] == MetricActor.DATASET_CHARCONTEXT or \
                        query['metric_actors'][key][0] == MetricActor.DATASET_CHAR:
                        query['metric_actors'][key] = (query['metric_actors'][key][0],
                                                       query['metric_actors'][key][1].replace("_df", ""))
                # Get the parameters for the actual metric computation: some cases depending on the specific query
                if query['metric_name'] in ['google bleu', 'meteor', 'rouge l', 'mpnet embedding similarity',
                                'emotion classifier', 'distinct', 'roberta crossencoding similarity',
                                'repetitiveness', 'translation error rate', 'bertscore', 'bleurt', 'bartscore',
                                'word mover distance', 't5 grammar correction edit distance',
                                'extended edit distance', 'flesch-kincaid index']:
                    # For most, simply use the correct arg name and fetch the list of strings
                    args_map = {
                        'predictor': 'predictions', 'reference': 'references', 'document': 'sentences',
                        'document0': 'sentences_a', 'document1': 'sentences_b'
                    }
                    metric = get_cache_metric(query['metric_name'])
                    args_dict = {}
                    for actor_key, actor_pair in query['metric_actors'].items():
                        args_dict[args_map[actor_key]] = sentence_callable(query['reference_set'],
                                                                           actor_pair[1],
                                                                           actor_to_column_map[actor_pair[0]])
                # For COMET, also get the proper args, but naming is slightly different
                elif query['metric_name'] == 'comet':
                    args_map = {
                        'predictor': 'predictions', 'reference': 'references', 'document': 'sources'
                    }
                    metric = get_cache_metric(query['metric_name'])
                    args_dict = {}
                    for actor_key, actor_pair in query['metric_actors'].items():
                        args_dict[args_map[actor_key]] = sentence_callable(query['reference_set'],
                                                                           actor_pair[1],
                                                                           actor_to_column_map[actor_pair[0]])
                # For perplexity, the special "perplexity_callable" is used to fetch the args, since it requires the model
                elif query['metric_name'] in ['perplexity']:
                    actor_pair = list(query['metric_actors'].values())[0]
                    metric = get_cache_metric(query['metric_name'])
                    args_dict = perplexity_callable(query['reference_set'],
                                                    actor_pair[1])
                # For frequency classifier, handle the custom param "mode"
                elif query['metric_name'] in ['frequency chatbot classifier']:
                    actor_pair = list(query['metric_actors'].values())[0]
                    metric = get_cache_metric(query['metric_name'],
                                              mode=query['metric_params']['mode'])
                    # The custom param is consumed here and must not reach metric.compute
                    del query['metric_params']['mode']
                    args_dict = {
                        'sentences': sentence_callable(query['reference_set'],
                                                       actor_pair[1],
                                                       actor_to_column_map[actor_pair[0]])
                    }
                # For distilbert classifier, handle the custom param "with_barney"
                elif query['metric_name'] in ['distilbert-embedded chatbot classifier']:
                    actor_pair = list(query['metric_actors'].values())[0]
                    metric = get_cache_metric(query['metric_name'],
                                              with_barney=query['metric_params']['with_barney'])
                    # The custom param is consumed here and must not reach metric.compute
                    del query['metric_params']['with_barney']
                    args_dict = {
                        'sentences': sentence_callable(query['reference_set'],
                                                       actor_pair[1],
                                                       actor_to_column_map[actor_pair[0]])
                    }
                # For neural classifier, handle the custom param "classifier_char"
                elif query['metric_name'] in ['neural chatbot classifier']:
                    actor_pair = list(query['metric_actors'].values())[0]
                    classifier_char = query['metric_params']['classifier_char']
                    args_dict = {
                        'character': classifier_char,
                        'load_path': os.path.join(base_folder, "Data", "Characters",
                                      classifier_char, character_dict[classifier_char]['classifier_folder']),
                    }
                    metric = get_cache_metric(query['metric_name'],
                                              classifier_char=classifier_char)
                    # The custom param is consumed here and must not reach metric.compute
                    del query['metric_params']['classifier_char']
                    args_dict['sentences'] = sentence_callable(query['reference_set'],
                                                               actor_pair[1],
                                                               actor_to_column_map[actor_pair[0]])
                # Finally, compute the actual metric on the args
                query_output['answer'] = metric.compute(**{**args_dict, **query['metric_params']})
                results[query_hash] = query_output
        # If a query fails, do not interrupt the whole operation, just print the failure and proceed to the next query
        except Exception as e:
            print("Query failed due to " + str(type(e)) + " with message " + str(e))
        print()
    print("Done.")
    # Return the results, as a dictionary indexed by query hash
    return results

# Example of Running an Evaluation

In [13]:
# Metric Name: See BBMetric.metrics_list
# Metric Params: See the optional and required params of each metric
## NOTE: For neural chatbot classifier, add 'classifier_char' as a parameter
# Metric Actors:
## DATASET_CHARCONTEXT: (any character | "Common") + "_df"
## DATASET_CHAR: (any character | "Common") + "_df"
## DIALOGPT_GREEDY: any character | "Base"
## DIALOGPT_NBEAMS: any character | "Base"
## DIALOGPT_SAMPLE: any character | "Base"
# Reference Set: (any character | "Common") + "_df"
# Metric Attempt: Defaults to 0, add a number to save multiple runs of the same query

# Example round with a single query: google bleu on the Vader dataset, using the dataset responses
# as predictions and the dataset contexts as references
queries = [
    dict(
        metric_name='google bleu',
        metric_actors=dict(
            predictor=(MetricActor.DATASET_CHAR, 'Vader_df'),
            reference=(MetricActor.DATASET_CHARCONTEXT, 'Vader_df'),
        ),
        reference_set='Vader_df',
        metric_params=dict(),
        metric_attempt=0,
    )
]

In [14]:
#evaluate_round(queries)

# Run Evaluations

Here we use the above code to compute the actual metrics. Several variations on this are possible, as the system is made to be quite flexible.

## 10 Sentences Test

In [15]:
# Data for the "10 sentences" test: for each character, one context sentence, its label and
# ten responses to that context.
test_10 = dict()

# The test is performed over Barney and Vader; each one uses a different row of the Common test set
test_10_rows = {'Barney': 0, 'Vader': 20}

# Fetch the Common test set once, rather than once per field and per character
common_testset = get_cache_testset('Common', base_folder)

for char, row in test_10_rows.items():
    # Context sentence and expected answer for this character
    test_10[char + '_context'] = common_testset['context/0'][row]
    test_10[char + '_label'] = common_testset['response'][row]
    # No model is loaded, since the responses are precomputed (see the end of this cell)
    test_10[char + '_model'] = None
    # To run a new test, uncomment the lines below and remove the precomputed responses
    # assigned after this loop
    #test_10[char + '_responses'] = list()
    #test_10[char + '_model'] = get_cache_model(char)
    #for i in tqdm(range(10)):
    #    tokenized_question = cache.tokenizer.encode(test_10[char + '_context'] + cache.tokenizer.eos_token,
    #                                                return_tensors='tf')
    #    max_length = 128 + tokenized_question.shape[1]
    #    generated_answer = test_10[char + '_model'].generate(
    #                        tokenized_question,
    #                        pad_token_id=cache.tokenizer.eos_token_id,
    #                        max_length=max_length,
    #                        do_sample=True,
    #                        top_k=BBData.top_k,
    #                        top_p=BBData.top_p)[0].numpy().tolist()
    #    generated_answer = generated_answer[len(tokenized_question[0]):]
    #    test_10[char + '_responses'].append(cache.tokenizer.decode(generated_answer, skip_special_tokens=True))

# Precomputed responses, kept for repeatability. They are assigned once, outside the loop,
# because they do not depend on the loop variable.
test_10['Barney_responses'] = [
    'No, this is about believing that you can rule the galaxy by making all the wrong decisions for your lover girlfriend.',
    'Barney, you are a disgrace.',
    "That's not the point. You're trying to fill a hole in the story of a really big problem that I've been trying to solve. Now, where are you going to put the tape?",
    "I'm pretty sure it's about a girl.",
    'Why do I keep doing that? I mean, who is this guy?',
    'I am not sure I like this story.',
    'No, this is about the girl.',
    "We're not going to get that.",
    'No, this is about the building.',
    "My building? It is on the property of a small local company. It was run by a woman. This woman's website said it's a women's only club.",
]
test_10['Vader_responses'] = [
    'What is your destiny.',
    'I will not',
    'That is not a true statement.',
    'We will not fight you. If you fight us.',
    "That's why you fail.",
    'You have not done well. Perhaps we should have a surprise meet.',
    'The Rebellion will not fight you, Commander. We will not fight you. The Empire has not broken you. The Empire has only broken your faith. You will not destroy us.',
    'I promise you my word.',
    "Don't do this to me.",
    'Lord Vader, I suggest you not try to keep this ship afloat. This ship has been constructed in perfect condition. And your men have only taken a few months to fix the damages.',
]

Using custom data configuration default-9ad41ba7d956ebd3
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/cache/csv/default-9ad41ba7d956ebd3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.45it/s]

Loaded cache at ['testset', 'Common_df']





In [16]:
# Print the context and 10 generated responses
for char in ['Barney', 'Vader']:
    print(char.upper())
    print("\t-Context-")
    print("\t\t* " + test_10[char + '_context'])
    print("\t-Answers-")
    for elem in test_10[char + '_responses']:
        print('\t\t* ' + elem)
    print()

BARNEY
	-Context-
		* Barney, this is about the building.
	-Answers-
		* No, this is about believing that you can rule the galaxy by making all the wrong decisions for your lover girlfriend.
		* Barney, you are a disgrace.
		* That's not the point. You're trying to fill a hole in the story of a really big problem that I've been trying to solve. Now, where are you going to put the tape?
		* I'm pretty sure it's about a girl.
		* Why do I keep doing that? I mean, who is this guy?
		* I am not sure I like this story.
		* No, this is about the girl.
		* We're not going to get that.
		* No, this is about the building.
		* My building? It is on the property of a small local company. It was run by a woman. This woman's website said it's a women's only club.

VADER
	-Context-
		* I will not fight you.
	-Answers-
		* What is your destiny.
		* I will not
		* That is not a true statement.
		* We will not fight you. If you fight us.
		* That's why you fail.
		* You have not done well. Perhaps we s

In [17]:
metric_filename = "10 Sentences Ranking.json"

# Extra data stored in the metric file next to the computed results: the generated sentences
# under test and the hard-coded human ranking values for each character
metric_dict = {
    'test_additional_data' : {
        'generated_sentences': {
            'Barney': test_10['Barney_responses'],
            'Vader': test_10['Vader_responses']
        }
    },
    'human_ranking': {
        'Barney': [28, 4, 20, 27, 9, 16, 21, 9, 12, 19],
        'Vader': [19, 15, 19, 21, 30, 7, 15, 15, 19, 5]
    }
}

# Metric families, grouped by the keyword arguments that compute() receives below
reference_based_metrics = ['bartscore', 'rouge l', 'google bleu', 'meteor', 'bertscore', 'bleurt',
                           'translation error rate', 'roberta crossencoding similarity']
sentence_pair_metrics = ['extended edit distance', 'word mover distance', 'mpnet embedding similarity']
single_document_metrics = ['frequency chatbot classifier', 'emotion classifier',
                           'distilbert-embedded chatbot classifier', 'distinct', 'repetitiveness',
                           't5 grammar correction edit distance', 'flesch-kincaid index']

# Compute the 10 sentences ranking. This code is very similar to the function "evaluate_round", and is necessary since
# the test is not computed on one of the standard datasets
for char in ['Vader', 'Barney']:
    context = test_10[char + '_context']
    label = test_10[char + '_label']
    responses = test_10[char + '_responses']
    for metric_name in ['distinct']:
        print("Computing " + metric_name + " for character " + char)
        # Metadata describing this query; most of it also feeds the hash used as result key
        query_output = dict()
        query_output['metric_name'] = metric_name
        query_output['metric_version'] = 1
        query_output['metric_attempt'] = 0
        query_output['context'] = {
            "dialogpt_size": "small",
            "dialogpt_context_sentences": BBData.context_n,
            "dialogpt_nbeams_beams": BBData.n_beams,
            "dialogpt_sample_top_p": BBData.top_p,
            "dialogpt_sample_top_k": BBData.top_k
        }
        query_output['metric_arity'] = get_metric_arity(metric_name)
        query_output['metric_determinism'] = get_metric_determinism(metric_name, 1)
        query_output['reference_set'] = [context]
        # Select the metric object, its parameters and one compute() call per generated sentence
        if metric_name in reference_based_metrics:
            metric = get_cache_metric(metric_name)
            metric_params = dict()
            query_output['metric_actors'] = {
                'predictor': (MetricActor.DIALOGPT_SAMPLE, char),
                'reference': [label]
            }
            compute_args = [{
                'predictions': sentence,
                'references': label
            } for sentence in responses]
        elif metric_name in sentence_pair_metrics:
            metric = get_cache_metric(metric_name)
            metric_params = dict()
            query_output['metric_actors'] = {
                'document0': (MetricActor.DIALOGPT_SAMPLE, char),
                'document1': [label]
            }
            compute_args = [{
                'sentences_a': sentence,
                'sentences_b': label
            } for sentence in responses]
        elif metric_name in single_document_metrics:
            if metric_name == 'frequency chatbot classifier':
                metric = get_cache_metric(metric_name, mode='c-tf-idf')
                metric_params = {'mode': 'c-tf-idf'}
            elif metric_name == 'distilbert-embedded chatbot classifier':
                metric = get_cache_metric(metric_name, with_barney=True)
                metric_params = {'with_barney': True, 'count_neighbors': True}
            else:
                metric = get_cache_metric(metric_name)
                metric_params = dict()
            query_output['metric_actors'] = {
                'document': (MetricActor.DIALOGPT_SAMPLE, char)
            }
            compute_args = [{
                'sentences': sentence
            } for sentence in responses]
        elif metric_name == 'perplexity':
            print("Skipping Perplexity.")
            continue
        elif metric_name == 'neural chatbot classifier':
            metric = get_cache_metric(metric_name, classifier_char=char)
            metric_params = {'classifier_char': char}
            query_output['metric_actors'] = {
                'document': (MetricActor.DIALOGPT_SAMPLE, char)
            }
            responses_n = len(responses)
            # Each call receives three consecutive responses, wrapping around the end of the list
            compute_args = [{
                'sentences': [responses[(i + offset) % responses_n] for offset in range(3)],
                'load_path': os.path.join(base_folder, "Data", "Characters", char, character_dict[char]['classifier_folder']),
                'character': char
            } for i in range(responses_n)]
        elif metric_name == 'comet':
            metric = get_cache_metric(metric_name)
            metric_params = dict()
            query_output['metric_actors'] = {
                'document': [context],
                'predictor': (MetricActor.DIALOGPT_SAMPLE, char),
                'reference': [label]
            }
            compute_args = [{
                'sources': context,
                'predictions': sentence,
                'references': label
            } for sentence in responses]
        else:
            # Without this guard an unrecognized name would either fail with a NameError or
            # silently reuse the metric and arguments left over from the previous iteration
            raise ValueError("Unsupported metric for the 10 sentences test: " + metric_name)
        query_output['metric_params'] = metric_params
        query_hash = dict_hash({'metric_name': query_output['metric_name'],
                                'metric_version': query_output['metric_version'],
                                'reference_set': query_output['reference_set'],
                                'metric_attempt': query_output['metric_attempt'],
                                'metric_actors': query_output['metric_actors'],
                                'context': query_output['context'],
                                'metric_params': query_output['metric_params']})
        # One metric value per generated sentence, in the same order as the responses
        results = [metric.compute(**args) for args in compute_args]
        query_output['answer'] = results
        metric_dict[query_hash] = query_output
        # The accumulated dictionary is saved again after every computed metric
        save_metric_by_name(os.path.join(out_folder, 'Advanced Tests'), metric_filename, metric_dict)

Computing distinct for character Vader
Computing distinct for character Barney


## Single Metrics

In [None]:
# Testing on metrics with arity 1
for metric in ['distinct', 'repetitiveness', 't5 grammar correction edit distance', 'flesch-kincaid index', 
               'frequency chatbot classifier', 'emotion classifier']:
    metric_pretty = BBMetric.load_metric(metric).pretty_name
    metric_params = dict()
    if metric == "distilbert-embedded chatbot classifier":
        metric_params = {'with_barney': True}
    elif metric == "frequency chatbot classifier":
        metric_params = {'mode': 'c-tf-idf'}
    results = evaluate_round([
        {
            'metric_name': metric,
            'metric_actors': {
                'document': (MetricActor.DATASET_CHAR, char + '_df')
            },
            'reference_set': char + '_df',
            'metric_params': metric_params.copy(),
            'metric_attempt': 0
        } for char in characters + ["Common"]
    ] + [
        {
            'metric_name': metric,
            'metric_actors': {
                'document': (MetricActor.DIALOGPT_SAMPLE, char)
            },
            'reference_set': char + '_df',
            'metric_params': metric_params.copy(),
            'metric_attempt': 0
        } for char in characters
    ] + [
        {
            'metric_name': metric,
            'metric_actors': {
                'document': (MetricActor.DIALOGPT_SAMPLE, char)
            },
            'reference_set': 'Common_df',
            'metric_params': metric_params.copy(),
            'metric_attempt': 0
        } for char in characters + ["Base"]
    ])
    metric_dict = load_metric_by_name(out_folder, metric_pretty)
    metric_dict = {**metric_dict, **results}
    save_metric_by_name(out_folder, metric_pretty, metric_dict)

#### Running Query 1/26 ####
Evaluating distinct on reference set Barney_df with:
	Barney_df dataset as document


Using custom data configuration default-d05c63f64527f593
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-d05c63f64527f593/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 18.71it/s]
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-d05c63f64527f593/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-e57fb526fe4a3ff3.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-d05c63f64527f593/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-54c202230f70778c.arrow
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyB

Loaded cache at ['testset', 'Barney_df']

#### Running Query 2/26 ####
Evaluating distinct on reference set Sheldon_df with:
	Sheldon_df dataset as document


Using custom data configuration default-b47497d241584694
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-b47497d241584694/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.19it/s]
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-b47497d241584694/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-029452f8e061c873.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-b47497d241584694/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-c14841bc6fd888d0.arrow
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyB

Loaded cache at ['testset', 'Sheldon_df']

#### Running Query 3/26 ####
Evaluating distinct on reference set Harry_df with:
	Harry_df dataset as document


Using custom data configuration default-8734ab070ca4d4a4
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-8734ab070ca4d4a4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 18.64it/s]
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-8734ab070ca4d4a4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-6f2c723a222e2152.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-8734ab070ca4d4a4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-06309c8f05ebbafb.arrow
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyB

Loaded cache at ['testset', 'Harry_df']

#### Running Query 4/26 ####
Evaluating distinct on reference set Fry_df with:
	Fry_df dataset as document


Using custom data configuration default-edae583082198a82
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-edae583082198a82/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 21.34it/s]
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-edae583082198a82/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-3a40bd254dc1230b.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-edae583082198a82/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-76bce446c8545bcd.arrow
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyB

Loaded cache at ['testset', 'Fry_df']

#### Running Query 5/26 ####
Evaluating distinct on reference set Bender_df with:
	Bender_df dataset as document


Using custom data configuration default-36d673def5b55b14
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-36d673def5b55b14/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 31.91it/s]
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-36d673def5b55b14/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-45e9a5cdc4703907.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-36d673def5b55b14/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-3d73977ab48375f4.arrow
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyB

Loaded cache at ['testset', 'Bender_df']

#### Running Query 6/26 ####
Evaluating distinct on reference set Vader_df with:
	Vader_df dataset as document


Using custom data configuration default-3e37c23a51e9d556
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-3e37c23a51e9d556/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 18.65it/s]
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-3e37c23a51e9d556/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-6affe80547fc0f38.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-3e37c23a51e9d556/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-6800bd34204a2976.arrow
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyB

Loaded cache at ['testset', 'Vader_df']

#### Running Query 7/26 ####
Evaluating distinct on reference set Joey_df with:
	Joey_df dataset as document


Using custom data configuration default-bc30ecf942c9a05e
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-bc30ecf942c9a05e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.78it/s]
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-bc30ecf942c9a05e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-ba044a5350a0b4ee.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-bc30ecf942c9a05e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-27a2b109b2a74d44.arrow
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyB

Loaded cache at ['testset', 'Joey_df']

#### Running Query 8/26 ####
Evaluating distinct on reference set Phoebe_df with:
	Phoebe_df dataset as document


Using custom data configuration default-437509a5e3c6484b
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-437509a5e3c6484b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 15.99it/s]
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-437509a5e3c6484b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-f3dedb84963f27fc.arrow and E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/Src/cache/csv/default-437509a5e3c6484b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-f69b60477b60f2fe.arrow
Loading cached split indices for dataset at E:/University/Esami da Superare/Natural Language Processing/BarneyB

Loaded cache at ['testset', 'Phoebe_df']

#### Running Query 9/26 ####
Evaluating distinct on reference set Common_df with:
	Common_df dataset as document

#### Running Query 10/26 ####
Evaluating distinct on reference set Barney_df with:
	Barney dialogpt (sampling) as document
Loading predictions from stored file
Loaded predictions from stored file
Loaded cache at ['predictions', 'Barney_df', 'Barney', 'sampling']

#### Running Query 11/26 ####
Evaluating distinct on reference set Sheldon_df with:
	Sheldon dialogpt (sampling) as document
Loading predictions from stored file
Loaded predictions from stored file
Loaded cache at ['predictions', 'Sheldon_df', 'Sheldon', 'sampling']

#### Running Query 12/26 ####
Evaluating distinct on reference set Harry_df with:
	Harry dialogpt (sampling) as document
Loading predictions from stored file
Loaded predictions from stored file
Loaded cache at ['predictions', 'Harry_df', 'Harry', 'sampling']

#### Running Query 13/26 ####
Evaluating distinct on

Using custom data configuration default-9ad41ba7d956ebd3
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/cache/csv/default-9ad41ba7d956ebd3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 64.01it/s]
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\Data\Characters\Barney\barney_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transfo

Creating predictions


  0%|                                                                                           | 0/35 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  3%|██▎                                                                                | 1/35 [00:17<09:54, 17.49s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  6%|████▋                                                                              | 2/35 [00:22<05:39, 10.28s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  9%|███████                                                                            | 3/35 [00:24<03:29,  6.54s/it]A dec

Loaded cache at ['predictions', 'Common_df', 'Barney', 'sampling']

#### Running Query 19/26 ####
Evaluating distinct on reference set Common_df with:
	Sheldon dialogpt (sampling) as document


Using custom data configuration default-9ad41ba7d956ebd3
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/cache/csv/default-9ad41ba7d956ebd3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\Data\Characters\Sheldon\sheldon_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Trans

Creating predictions


  0%|                                                                                           | 0/35 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  3%|██▎                                                                                | 1/35 [00:03<01:46,  3.13s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  6%|████▋                                                                              | 2/35 [00:05<01:28,  2.67s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  9%|███████                                                                            | 3/35 [00:08<01:32,  2.89s/it]A dec

Loaded cache at ['predictions', 'Common_df', 'Sheldon', 'sampling']

#### Running Query 20/26 ####
Evaluating distinct on reference set Common_df with:
	Harry dialogpt (sampling) as document


Using custom data configuration default-9ad41ba7d956ebd3
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/cache/csv/default-9ad41ba7d956ebd3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 200.16it/s]
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\Data\Characters\Harry\harry_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transform

Creating predictions


  0%|                                                                                           | 0/35 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  3%|██▎                                                                                | 1/35 [00:04<02:25,  4.27s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  6%|████▋                                                                              | 2/35 [00:06<01:43,  3.14s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  9%|███████                                                                            | 3/35 [00:08<01:27,  2.74s/it]A dec

Loaded cache at ['predictions', 'Common_df', 'Harry', 'sampling']

#### Running Query 21/26 ####
Evaluating distinct on reference set Common_df with:
	Fry dialogpt (sampling) as document


Using custom data configuration default-9ad41ba7d956ebd3
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/cache/csv/default-9ad41ba7d956ebd3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\Data\Characters\Fry\fry_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers!

Creating predictions


  0%|                                                                                           | 0/35 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  3%|██▎                                                                                | 1/35 [00:01<00:58,  1.73s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  6%|████▋                                                                              | 2/35 [00:06<01:49,  3.32s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  9%|███████                                                                            | 3/35 [00:09<01:49,  3.43s/it]A dec

Loaded cache at ['predictions', 'Common_df', 'Fry', 'sampling']

#### Running Query 22/26 ####
Evaluating distinct on reference set Common_df with:
	Bender dialogpt (sampling) as document


Using custom data configuration default-9ad41ba7d956ebd3
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/cache/csv/default-9ad41ba7d956ebd3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 248.10it/s]
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\Data\Characters\Bender\bender_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transfo

Creating predictions


  0%|                                                                                           | 0/35 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  3%|██▎                                                                                | 1/35 [00:08<04:54,  8.67s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  6%|████▋                                                                              | 2/35 [00:10<02:37,  4.76s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  9%|███████                                                                            | 3/35 [00:12<01:49,  3.41s/it]A dec

Loaded cache at ['predictions', 'Common_df', 'Bender', 'sampling']

#### Running Query 23/26 ####
Evaluating distinct on reference set Common_df with:
	Vader dialogpt (sampling) as document


Using custom data configuration default-9ad41ba7d956ebd3
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/cache/csv/default-9ad41ba7d956ebd3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\Data\Characters\Vader\vader_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transform

Creating predictions


  0%|                                                                                           | 0/35 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  3%|██▎                                                                                | 1/35 [00:01<00:36,  1.07s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  6%|████▋                                                                              | 2/35 [00:03<01:04,  1.96s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  9%|███████                                                                            | 3/35 [00:06<01:16,  2.40s/it]A dec

Loaded cache at ['predictions', 'Common_df', 'Vader', 'sampling']

#### Running Query 24/26 ####
Evaluating distinct on reference set Common_df with:
	Joey dialogpt (sampling) as document


Using custom data configuration default-9ad41ba7d956ebd3
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/cache/csv/default-9ad41ba7d956ebd3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\Data\Characters\Joey\joey_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformer

Creating predictions


  0%|                                                                                           | 0/35 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  3%|██▎                                                                                | 1/35 [00:05<02:58,  5.24s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  6%|████▋                                                                              | 2/35 [00:12<03:27,  6.30s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  9%|███████                                                                            | 3/35 [00:15<02:29,  4.67s/it]A dec

Loaded cache at ['predictions', 'Common_df', 'Joey', 'sampling']

#### Running Query 25/26 ####
Evaluating distinct on reference set Common_df with:
	Phoebe dialogpt (sampling) as document


Using custom data configuration default-9ad41ba7d956ebd3
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/cache/csv/default-9ad41ba7d956ebd3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 64.02it/s]
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\Data\Characters\Phoebe\phoebe_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transfo

Creating predictions


  0%|                                                                                           | 0/35 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  3%|██▎                                                                                | 1/35 [00:12<07:06, 12.53s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  6%|████▋                                                                              | 2/35 [00:20<05:17,  9.61s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  9%|███████                                                                            | 3/35 [00:22<03:15,  6.11s/it]A dec

Loaded cache at ['predictions', 'Common_df', 'Phoebe', 'sampling']

#### Running Query 26/26 ####
Evaluating distinct on reference set Common_df with:
	Base dialogpt (sampling) as document


Using custom data configuration default-9ad41ba7d956ebd3
Found cached dataset csv (E:/University/Esami da Superare/Natural Language Processing/BarneyBotGit/BarneyBot/cache/csv/default-9ad41ba7d956ebd3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 63.74it/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████| 498M/498M [01:30<00:00, 5.49MB/s]
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at microsoft/DialoGPT-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train Te

Creating predictions


  0%|                                                                                           | 0/35 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  3%|██▎                                                                                | 1/35 [00:04<02:19,  4.11s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  6%|████▋                                                                              | 2/35 [00:09<02:37,  4.76s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  9%|███████                                                                            | 3/35 [00:14<02:36,  4.89s/it]A dec

In [None]:
# Testing on neural chatbot classifier
#
# For every character this cell queues, in order:
#   1. the character's own dataset, scored against its own reference set;
#   2. the character's DialoGPT (sampling) model, scored against its own reference set;
#   3. the same DialoGPT (sampling) model, scored against 'Common_df';
#   4. a `flush_cache_entries` step targeting that character's
#      ['trained_metric', 'neural chatbot classifier', char] cache entry
#      (presumably to release the trained classifier before the next character
#      is processed -- confirm against lib.BBMetricResults).
# The fresh results are then merged over whatever is already stored for this
# metric in `out_folder` and written back.
metric = 'neural chatbot classifier'
metric_pretty = BBMetric.load_metric(metric).pretty_name
metric_params = dict()


def _classifier_round_for(char):
    """Return the three classifier queries plus the cache-flush step for `char`."""
    own_set = char + '_df'
    # (document actor, reference set) for each of the three evaluations, in run order.
    evaluations = (
        ((MetricActor.DATASET_CHAR, own_set), own_set),
        ((MetricActor.DIALOGPT_SAMPLE, char), own_set),
        ((MetricActor.DIALOGPT_SAMPLE, char), 'Common_df'),
    )
    round_steps = [
        {
            'metric_name': metric,
            'metric_actors': {'document': document_actor},
            'reference_set': reference_set,
            'metric_params': {'classifier_char': char},
            'metric_attempt': 0
        }
        for document_actor, reference_set in evaluations
    ]
    # Non-metric step: executed by evaluate_round after the three queries above.
    round_steps.append({
        'run': flush_cache_entries,
        'run_args': {
            'entries': [['trained_metric', 'neural chatbot classifier', char]]
        }
    })
    return round_steps


results = evaluate_round(flatten([_classifier_round_for(char) for char in characters]))
# Newly computed entries take precedence over previously saved ones with the same key.
metric_dict = {**load_metric_by_name(out_folder, metric_pretty), **results}
save_metric_by_name(out_folder, metric_pretty, metric_dict)

In [29]:
# Testing on perplexity
#
# Four groups of queries are evaluated in a single round, all using the
# DialoGPT (sampling) actor:
#   a) cross-character pairs: model of the first character on the second
#      character's dataset;
#   b) each character's model on its own dataset;
#   c) the "Base" model on every character's dataset;
#   d) every character's model, plus "Base", on 'Common_df'.
# Results are merged over the stored values for this metric and saved.
metric = 'perplexity'
metric_pretty = BBMetric.load_metric(metric).pretty_name
metric_params = dict()


def _perplexity_query(actor_role, model_key, reference_set):
    """Build one perplexity query for `model_key` evaluated on `reference_set`.

    `actor_role` is the key under which the DialoGPT (sampling) actor is
    registered in 'metric_actors'.
    """
    return {
        'metric_name': metric,
        'metric_actors': {
            actor_role: (MetricActor.DIALOGPT_SAMPLE, model_key)
        },
        'reference_set': reference_set,
        # Fresh (empty) parameter dict per query, so queries never share state.
        'metric_params': metric_params.copy(),
        'metric_attempt': 0
    }


# (model character, dataset character) pairs for the cross-character group.
_cross_pairs = [('Joey', 'Phoebe'), ('Joey', 'Sheldon'), ('Bender', 'Fry'), ('Bender', 'Barney'),
                ('Barney', 'Harry')]

_perplexity_queries = []
# NOTE(review): this group registers the model as 'predictor' while the three
# groups below use 'document' -- confirm the difference is intentional.
_perplexity_queries.extend(
    _perplexity_query('predictor', model_char, data_char + '_df')
    for model_char, data_char in _cross_pairs
)
_perplexity_queries.extend(
    _perplexity_query('document', char, char + '_df') for char in characters
)
_perplexity_queries.extend(
    _perplexity_query('document', "Base", char + '_df') for char in characters
)
_perplexity_queries.extend(
    _perplexity_query('document', char, 'Common_df') for char in characters + ["Base"]
)

results = evaluate_round(_perplexity_queries)
# Newly computed entries take precedence over previously saved ones with the same key.
metric_dict = {**load_metric_by_name(out_folder, metric_pretty), **results}
save_metric_by_name(out_folder, metric_pretty, metric_dict)

#### Running Query 1/9 ####
Evaluating perplexity on reference set Common_df with:
	Barney dialogpt (sampling) as document


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\Data\Characters\Barney\barney_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


  0%|          | 0/1 [00:00<?, ?it/s]

Loaded cache at ['testset', 'Common_df']
Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\tfgpt2_main_layer
......vars
.........0
...layers\tfgpt2_main_layer\drop
......vars
...layers\tfgpt2_main_layer\h\tf_block
......vars
...layers\tfgpt2_main_layer\h\tf_block\attn
......vars
...layers\tfgpt2_main_layer\h\tf_block\attn\attn_dropout
......vars
...layers\tfgpt2_main_layer\h\tf_block\attn\c_attn
......vars
.........0
.........1
...layers\tfgpt2_main_layer\h\tf_block\attn\c_proj
......vars
.........0
.........1
...layers\tfgpt2_main_layer\h\tf_block\attn\resid_dropout
......vars
...layers\tfgpt2_main_layer\h\tf_block\ln_1
......vars
.........0
.........1
...layers\tfgpt2_main_layer\h\tf_block\ln_2
......vars
.........0
.........1
...layers\tfgpt2_main_layer\h\tf_block\mlp
......vars
...layers\tfgpt2_main_layer\h\tf_block\mlp\c_fc
......vars
.........0
.........1
...layers\tfgpt2_main_layer\h\tf_block\mlp\c_proj
......vars
.........0
.........1
...layers\tfgpt2_

Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-12-24 18:22:21         2297
metadata.json                                  2022-12-24 18:22:21           64
variables.h5                                   2022-12-24 18:22:24    498110488
Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\tfgpt2_main_layer
......vars
.........0
...layers\tfgpt2_main_layer\drop
......vars
...layers\tfgpt2_main_layer\h\tf_block
......vars
...layers\tfgpt2_main_layer\h\tf_block\attn
......vars
...layers\tfgpt2_main_layer\h\tf_block\attn\attn_dropout
......vars
...layers\tfgpt2_main_layer\h\tf_block\attn\c_attn
......vars
.........0
.........1
...layers\tfgpt2_main_layer\h\tf_block\attn\c_proj
......vars
.........0
.........1
...layers\tfgpt2_main_layer\h\tf_block\attn\resid_dropout
......vars
...layers\tfgpt2_main_layer\h\tf_block\ln_1
......vars
.........0
.........1
...la

Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-12-24 18:22:30         2299
metadata.json                                  2022-12-24 18:22:30           64
variables.h5                                   2022-12-24 18:22:35    498110488


  0%|          | 0/35 [00:00<?, ?ex/s]

Loaded cache at ['concat_and_encoded_testset', 'Common_df']


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:58<00:00, 11.77s/it]



#### Running Query 2/9 ####
Evaluating perplexity on reference set Common_df with:
	Sheldon dialogpt (sampling) as document


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\Data\Characters\Sheldon\sheldon_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:58<00:00, 11.74s/it]



#### Running Query 3/9 ####
Evaluating perplexity on reference set Common_df with:
	Harry dialogpt (sampling) as document


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\Data\Characters\Harry\harry_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:56<00:00, 11.20s/it]



#### Running Query 4/9 ####
Evaluating perplexity on reference set Common_df with:
	Fry dialogpt (sampling) as document


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\Data\Characters\Fry\fry_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:51<00:00, 10.24s/it]



#### Running Query 5/9 ####
Evaluating perplexity on reference set Common_df with:
	Bender dialogpt (sampling) as document


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\Data\Characters\Bender\bender_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:50<00:00, 10.14s/it]



#### Running Query 6/9 ####
Evaluating perplexity on reference set Common_df with:
	Vader dialogpt (sampling) as document


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\Data\Characters\Vader\vader_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:53<00:00, 10.61s/it]



#### Running Query 7/9 ####
Evaluating perplexity on reference set Common_df with:
	Joey dialogpt (sampling) as document


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\Data\Characters\Joey\joey_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:51<00:00, 10.34s/it]



#### Running Query 8/9 ####
Evaluating perplexity on reference set Common_df with:
	Phoebe dialogpt (sampling) as document


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot\Data\Characters\Phoebe\phoebe_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:51<00:00, 10.27s/it]



#### Running Query 9/9 ####
Evaluating perplexity on reference set Common_df with:
	Base dialogpt (sampling) as document


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at microsoft/DialoGPT-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:50<00:00, 10.14s/it]


Done.





# COMET

In [22]:
# Testing on COMET
#
# One query per character, evaluated on that character's own reference set
# with three actors: the dataset context as 'document', the dataset itself as
# 'reference', and the character's DialoGPT (sampling) model as 'predictor'.
# Results are merged over the stored values for this metric and saved.
metric = "comet"
metric_pretty = BBMetric.load_metric(metric).pretty_name
metric_params = dict()


def _comet_query(char):
    """Build the COMET query comparing `char`'s model output with its dataset."""
    own_set = char + '_df'
    actors = {
        'document': (MetricActor.DATASET_CHARCONTEXT, own_set),
        'reference': (MetricActor.DATASET_CHAR, own_set),
        'predictor': (MetricActor.DIALOGPT_SAMPLE, char)
    }
    return {
        'metric_name': metric,
        'metric_actors': actors,
        'reference_set': own_set,
        'metric_params': {},
        'metric_attempt': 0
    }


results = evaluate_round([_comet_query(char) for char in characters])
# Newly computed entries take precedence over previously saved ones with the same key.
metric_dict = {**load_metric_by_name(out_folder, metric_pretty), **results}
save_metric_by_name(out_folder, metric_pretty, metric_dict)

Downloading builder script:   0%|          | 0.00/5.79k [00:00<?, ?B/s]

eamt22-cometinho-da.tar.gz: 307MB [00:58, 5.25MB/s]                                                                    


Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/430 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/471M [00:00<?, ?B/s]



#### Running Query 1/8 ####
Evaluating comet on reference set Barney_df with:
	Barney_df dataset labels as document
	Barney_df dataset as reference
	Barney dialogpt (sampling) as predictor


  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs



#### Running Query 2/8 ####
Evaluating comet on reference set Sheldon_df with:
	Sheldon_df dataset labels as document
	Sheldon_df dataset as reference
	Sheldon dialogpt (sampling) as predictor


  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs



#### Running Query 3/8 ####
Evaluating comet on reference set Harry_df with:
	Harry_df dataset labels as document
	Harry_df dataset as reference
	Harry dialogpt (sampling) as predictor


  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs



#### Running Query 4/8 ####
Evaluating comet on reference set Fry_df with:
	Fry_df dataset labels as document
	Fry_df dataset as reference
	Fry dialogpt (sampling) as predictor


  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs



#### Running Query 5/8 ####
Evaluating comet on reference set Bender_df with:
	Bender_df dataset labels as document
	Bender_df dataset as reference
	Bender dialogpt (sampling) as predictor


  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs



#### Running Query 6/8 ####
Evaluating comet on reference set Vader_df with:
	Vader_df dataset labels as document
	Vader_df dataset as reference
	Vader dialogpt (sampling) as predictor


  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs



#### Running Query 7/8 ####
Evaluating comet on reference set Joey_df with:
	Joey_df dataset labels as document
	Joey_df dataset as reference
	Joey dialogpt (sampling) as predictor


  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs



#### Running Query 8/8 ####
Evaluating comet on reference set Phoebe_df with:
	Phoebe_df dataset labels as document
	Phoebe_df dataset as reference
	Phoebe dialogpt (sampling) as predictor


  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs



Done.
