# Setup

In [38]:
import logging
import hashlib
import json
from enum import Enum
import os

import pprint
printer = pprint.PrettyPrinter(depth=4, width=300)

In [2]:
from Data.data_dicts import character_dict

def is_character(char):
    if char in character_dict.keys() and char != 'Default':
        return True
    elif char == 'Base' or char == 'Common':
        return False
    else:
        logging.error("Unknown character name " + char + "!")

In [3]:
# Mount google drive
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False
    os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
    # Install dependencies
    os.system("pip install datasets")
    os.system("pip install transformers")
    os.system("pip install rouge_score")
    os.system("pip install -U sentence-transformers")
else:
    base_folder = os.getcwd()
    
# Set metrics folder for inputs
in_metrics_folder = os.path.join(base_folder, 'Metrics', 'New')
if not os.path.exists(in_metrics_folder):
    os.makedirs(in_metrics_folder)

# Set metrics folder for outputs
out_metrics_folder = os.path.join(base_folder, 'Metrics', 'New')
if not os.path.exists(out_metrics_folder):
    os.makedirs(out_metrics_folder)

In [4]:
# Store a json file
def save_as_json(filepath, filename, data):
    if not os.path.exists(filepath):
        os.makedirs(filepath, exist_ok=True)
    with open(os.path.join(filepath, filename + ".json"), 'w') as f:
        f.write(json.dumps(data, indent=4))

# Load a json file
def load_from_json(filepath, filename):
    if not os.path.exists(os.path.join(filepath, filename + '.json')):
        return dict()
    with open(os.path.join(filepath, filename + '.json'), 'r') as f:
        return json.load(f)

In [5]:
from typing import Dict, Any

# Compute a string hash from a (nested) dict
def dict_hash(dictionary: Dict[str, Any]) -> str:
    dhash = hashlib.md5()
    encoded = json.dumps(dictionary, sort_keys=True).encode()
    dhash.update(encoded)
    return dhash.hexdigest()

In [23]:
class MetricDependency(int, Enum):
    DATASET = 0      # Metric depends on datasets only and/or base DialoGPT model
    COHERENT = 1     # Metric depends on chatbot trained on its dataset
    ADVERSARIAL = 2  # Metric depends on chatbot trained on a dataset but with another dataset of reference
    COMPARATIVE = 3  # Metric depends on comparison between chatbots

class MetricArity(int, Enum):
    SINGLE = 1
    PAIRWISE = 2

class MetricDeterminism(int, Enum):
    DETERMINISTIC = 0 # There is a closed-form equation for this metric, which is fully computed
    PROBABILISTIC = 1 # The metric is obtained through explainable approx., e.g. SGD, partial computation on a subset...
    NEURAL = 2        # The metric is obtained via a neural network
    HUMAN = 4         # The metric is obtained via human surveying

class MetricActor(int, Enum):
    DATASET_CHARCONTEXT = 0     # any character but not 'Default', including "Common"
    DATASET_CHAR = 1   # any character but not 'Default', including "Common"
    DIALOGPT_GREEDY = 10  # any character including 'Base'
    DIALOGPT_NBEAMS = 11  # any character including 'Base'
    DIALOGPT_SAMPLE = 12  # any character including 'Base'

In [32]:
def get_metric_arity(metric_name):
    if metric_name == 'bleu' or metric_name == 'rouge l' or metric_name == 'semantic similarity' or \
       metric_name == 'semantic answer similarity' or metric_name == 'chatbot classifier' or metric_name == 'perplexity':
        return MetricArity.PAIRWISE
    elif metric_name == 'distinct' or metric_name == 'emotion classifier' or metric_name == 'lines count':
        return MetricArity.SINGLE
    elif metric_name == 'dummy metric':
        return MetricArity.PAIRWISE
    else:
        logging.error("Unknown arity for metric " + metric_name)
        
def get_metric_determinism(metric_name, metric_version):
    if metric_name == 'lines count' and metric_version == 1:
        return MetricDeterminism.DETERMINISTIC
    elif metric_name == 'bleu' and metric_version == 1:
        return MetricDeterminism.DETERMINISTIC
    elif metric_name == 'rouge l' and metric_version == 1:
        return MetricDeterminism.DETERMINISTIC
    elif metric_name == 'semantic similarity' and metric_version == 1:
        return MetricDeterminism.NEURAL
    elif metric_name == 'semantic answer similarity' and metric_version == 1:
        return MetricDeterminism.NEURAL
    elif metric_name == 'perplexity' and metric_version == 1:
        return MetricDeterminism.DETERMINISTIC
    elif metric_name == 'chatbot classifier' and metric_version == 1:
        return MetricDeterminism.PROBABILISTIC
    elif metric_name == 'distinct' and metric_version == 1:
        return MetricDeterminism.DETERMINISTIC
    elif metric_name == 'emotion classifier' and metric_version == 1:
        return MetricDeterminism.NEURAL
    elif metric_name == 'dummy metric':
        return MetricDeterminism.DETERMINISTIC
    else:
        logging.error("Unknown determinism for metric " + metric_name)
        
def get_metric_dependency(metric_name, metric_actors):
    actors_order = ['training_set', 'predictor', 'reference', 'document', 'document0', 'document1'] 
    actor_types = [metric_actors[key][0] for key in actors_order if key in metric_actors]
    actor_chars = [metric_actors[key][1] for key in actors_order if key in metric_actors]
    if metric_name == 'lines count' or metric_name == 'distinct' or metric_name == 'emotion classifier':
        if all(at.value < 10 for at in actor_types):
            return MetricDependency.DATASET
        elif actor_chars[0] == 'Base':
            return MetricDependency.DATASET
        else:
            return MetricDependency.COHERENT
    elif metric_name == 'bleu' or metric_name == 'rouge l' or \
         metric_name == 'semantic answer similarity' or metric_name == 'chatbot classifier' or metric_name == 'perplexity':
        if all(at.value < 10 for at in actor_types):
            return MetricDependency.DATASET
        elif actor_types[0].value < 10 and actor_chars[1] == 'Base':
            return MetricDependency.DATASET
        elif actor_chars[0] == actor_chars[1]:
            return MetricDependency.COHERENT
        elif all(at.value >= 10 for at in actor_types):
            return MetricDependency.COMPARATIVE
        else:
            return MetricDependency.ADVERSARIAL
    elif metric_name == 'dummy metric':
        return MetricDependency.DATASET
    else:
        logging.error("Unknown dependency for metric " + metric_name)

# Save Example

In [55]:
# Metric metadata creation
metric_name = 'dummy metric'
metric_name_pretty = 'Dummy Metric'
metric_version = 1
metric_actors = {
    "document": [
        MetricActor.DATASET_CHAR,
        "Barney"
    ],
    "training_set": [
        MetricActor.DATASET_CHAR,
        "Barney"
    ]
}
metric_result = {
    "score": 0.9984034299850464,
    "std": 0.027748608961701393
}
metric_attempt = 0
metric_context = {                          
    "dialogpt_size": "small",
    "dialogpt_context_sentences": 5,
    "dialogpt_nbeams_beams": 3,
    "dialogpt_sample_top_p": 0.92,
    "dialogpt_sample_top_k": 50
}
metric_params = {}
metric_samples = 'Unknown'
metric_hash = dict_hash({'metric_name': metric_name,
                         'metric_version': metric_version,
                         'metric_attempt': metric_attempt,
                         'metric_actors': metric_actors,
                         'context': metric_context,
                         'metric_params': metric_params,
                         'metric_samples': metric_samples})

# This is the important one. Each metric should contain all these entries
metric_dict = {
        "metric_name": metric_name,           # Unique name of the metric
        "metric_version": metric_version,     # Metric version (useful if you change how a metric works and recompute)
        "metric_attempt": metric_attempt,     # Incremental value for multiple computations of the same metric (e.g. for std)
        "metric_actors": metric_actors,       # Who this metric is computed on
        "metric_dependency": get_metric_dependency(metric_name, metric_actors), # Is this metric a function of data or chatbot?
        "metric_params": metric_params,       # Additional params of the metric (e.g. ngram_size for distinct)
        "context": metric_context,            # External parameters, such as chatbot characteristics
        "metric_arity": get_metric_arity(metric_name), # Metric arity
        "metric_samples": metric_samples,     # Batch size of the metric
        "metric_determinism": get_metric_determinism(metric_name, metric_version), # Is this metric algorithmic or not?
        "answer": metric_result,              # Score of the metric, may include std (Any dictionary can go here)
        "hash": metric_hash                   # Unique hash for this metric, used to not store duplicates
    }

metric_dict = {
    metric_hash: metri
}
printer.pprint(metric_dict) # Metric is now ready to be saved!

{'answer': {'score': 0.9984034299850464, 'std': 0.027748608961701393},
 'context': {'dialogpt_context_sentences': 5, 'dialogpt_nbeams_beams': 3, 'dialogpt_sample_top_k': 50, 'dialogpt_sample_top_p': 0.92, 'dialogpt_size': 'small'},
 'hash': '220b59ba420e4af16b9217f05635815d',
 'metric_actors': {'document': [<MetricActor.DATASET_CHAR: 1>, 'Barney'], 'training_set': [<MetricActor.DATASET_CHAR: 1>, 'Barney']},
 'metric_arity': <MetricArity.PAIRWISE: 2>,
 'metric_attempt': 0,
 'metric_dependency': <MetricDependency.DATASET: 0>,
 'metric_determinism': <MetricDeterminism.DETERMINISTIC: 0>,
 'metric_name': 'dummy metric',
 'metric_params': {},
 'metric_samples': 'Unknown',
 'metric_version': 1}


In [56]:
def save_metric_by_name(metric_name_pretty, metric_dict):
    # Multiple metrics are stored as a dictionary, indexed by hash. Load the existing ones
    if os.path.exists(os.path.join(in_metrics_folder, metric_name_pretty)):
        metrics = load_from_json(in_metrics_folder, metric_name_pretty)
    else:
        metrics = dict()
    # Add new entry
    metrics.update(metric_dict)
    # Save metrics file
    save_as_json(in_metrics_folder, metric_name_pretty, metrics)

In [None]:
save_metric_by_name("Dummy Metric", metric_dict)

# Metric Loading

In [59]:
def load_metric_by_name(metric_name_pretty):
    metrics = load_from_json(in_metrics_folder, metric_name_pretty)
    for entry in metrics.values():
        for actor in entry['metric_actors'].values():
            actor[0] = MetricActor(actor[0])
        entry['metric_dependency'] = MetricDependency(entry['metric_dependency'])
        entry['metric_determinism'] = MetricDependency(entry['metric_determinism'])
        entry['metric_arity'] = MetricArity(entry['metric_arity'])
    return metrics

In [60]:
metrics = load_metric_by_name("Dummy Metric")
print(metrics)

{'220b59ba420e4af16b9217f05635815d': {'metric_name': 'dummy metric', 'metric_version': 1, 'metric_attempt': 0, 'metric_actors': {'document': [<MetricActor.DATASET_CHAR: 1>, 'Barney'], 'training_set': [<MetricActor.DATASET_CHAR: 1>, 'Barney']}, 'metric_dependency': <MetricDependency.DATASET: 0>, 'metric_params': {}, 'context': {'dialogpt_size': 'small', 'dialogpt_context_sentences': 5, 'dialogpt_nbeams_beams': 3, 'dialogpt_sample_top_p': 0.92, 'dialogpt_sample_top_k': 50}, 'metric_arity': <MetricArity.PAIRWISE: 2>, 'metric_samples': 'Unknown', 'metric_determinism': <MetricDependency.DATASET: 0>, 'answer': {'score': 0.9984034299850464, 'std': 0.027748608961701393}, 'hash': '220b59ba420e4af16b9217f05635815d'}}
