In [1]:
import os
import sys

# Mount google drive (for Colab only).
# The notebook is treated as running in Colab when `google.colab` can be imported.
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    # Catch only a failed import, so unrelated errors (e.g. KeyboardInterrupt) are not swallowed
    IN_COLAB = False
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/BarneyBot'
    # Make the project packages importable; skip when a previous run of this cell already added the path
    if base_folder not in sys.path:
        sys.path.append(base_folder)
    # Install Huggingface libraries for running the notebook in Colab
    os.system("pip install datasets")
    os.system("pip install transformers")
    os.system("pip install sentence_transformers")
else:
    # Outside Colab the project root is the current working directory
    base_folder = os.getcwd()

# Import character dictionaries, useful to map a character to its data, and a fixed random seed
from Data.data_dicts import character_dict, source_dict, random_state
# Import the BBMetrics library, used to compute metric scores
from Lib.BBMetrics import BBMetric    

# Import Hugging Face transformers and load_dataset, used to run the model and load datasets
from transformers import TFAutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

Mounted at /content/drive


In [2]:
# Load the tokenizer of the pretrained DialoGPT small model; downloaded files
# are cached in a "cache" folder inside the current working directory
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='microsoft/DialoGPT-small',
    cache_dir=os.path.join(os.getcwd(), "cache"),
)
# Padding token used by the tokenizer
tokenizer.pad_token = '#'

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/641 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/641 [00:00<?, ?B/s]

# Human Metrics

In this notebook we ask the user to perform a subjective evaluation according to the following criteria:
* _Coherency_: the chatbot does not contradict itself over time
* _Consistency_: the chatbot follows the flow of a conversation naturally
* _Stylish_: the chatbot has a distinct personality, including related quirks.

The following cell loads the common dataset that is used to evaluate every character bot.

In [3]:
# Load the common CSV dataset that is used to evaluate every character bot
df_common = load_dataset(
    path='csv',
    data_files=os.path.join(base_folder, 'Data', 'common_dataset.csv'),
    cache_dir=os.path.join(base_folder, "cache"),
)

Using custom data configuration default-9fb6c45fdd9437d1


Downloading and preparing dataset csv/default to /content/drive/My Drive/BarneyBot/cache/csv/default-9fb6c45fdd9437d1/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /content/drive/My Drive/BarneyBot/cache/csv/default-9fb6c45fdd9437d1/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Below is a function that performs three steps to evaluate the character specified by the `character` parameter:
1. **Chat evaluation**, for estimating the chatbot's coherence (i.e. whether the chatbot avoids contradicting itself over time)
 * by giving a score from 0 to 5 (half scores are not allowed)
 
2. **Responses evaluation**, for estimating the chatbot's consistency (i.e. how true the chatbot's answers are with respect to what the user previously said)
 * by giving a score from 0 to 5 (half scores are not allowed)
 
3. **Style evaluation**, for estimating the chatbot's style (i.e. how close the chatbot's answers are to what the user thinks the real character would say in response)
 * by giving a score from 0 to 5 (half scores are not allowed)

In [10]:
def eval_character(character='Default', IN_COLAB=False):
    """Run the three-step interactive human evaluation of one character chatbot.

    The steps use, in order, the BBMetric metrics "human - coherence",
    "human - consistency" and "human - style". Each metric is given a CSV
    path inside the character's data folder as its save path.

    Args:
        character: Key of the character in `character_dict`.
        IN_COLAB: Not used by this function; kept so that existing calls
            which pass it keep working.

    Raises:
        KeyError: If `character` has no entry in `character_dict`.
        Exception: If the character's data folder does not exist.
    """
    # Fail with an explicit message instead of a bare KeyError from the lookups below
    if character not in character_dict:
        raise KeyError("Unknown character " + repr(character) + ": no entry in character_dict")
    character_info = character_dict[character]

    # Takes the source location from the dictionary
    source = character_info['source']

    # Checks if the character was trained
    character_folder = os.path.join(base_folder, 'Data', 'Characters', character)
    if not os.path.exists(character_folder):
        raise Exception("The character " + character + " doesn't exist")

    # Loads the pretrained model from the specified checkpoint folder `checkpoint_folder`
    checkpoint_folder = os.path.join(character_folder, character_info['checkpoint_folder'])
    model = TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint_folder)

    ### Compute human - coherence
    print("Step 1) Chat with", character, "\n\tPlease evaluate your chat with this character:", flush=True)
    # Loads the metric
    metric = BBMetric.load_metric("human - coherence")
    # Save path, inside the character folder checked above
    file_path = os.path.join(character_folder, "humancoherence.csv")
    # Performs the metric evaluation
    metric.train(model=model, tokenizer=tokenizer,
                 filepath=file_path,
                 length=5) # length is optional, defaults to 5

    ### Compute human - consistency
    print("Step 2) Answers from", character, "\n\tPlease evaluate how true these responses are for the character:", flush=True)
    # Loads the metric
    metric = BBMetric.load_metric("human - consistency")
    # Save path, inside the character folder checked above
    file_path = os.path.join(character_folder, "humanconsistency.csv")
    # Performs the metric evaluation
    metric.train(model=model, tokenizer=tokenizer,
                 filepath=file_path)

    ### Compute human - style
    print("Step 3) Answers from", character, "\n\tPlease evaluate the style of the responses.", flush=True)
    print("\tDo you think they are responses that", character, "would say?", flush=True)
    # Loads the metric
    metric = BBMetric.load_metric("human - style")
    # Save path, inside the character folder checked above
    file_path = os.path.join(character_folder, "humanstyle.csv")
    # Performs the metric evaluation; the questions are the 'context' entries of the
    # common dataset rows whose 'source' matches this character's source
    metric.train(model=model, tokenizer=tokenizer,
                 filepath=file_path,
                 questions=df_common['train'].filter(lambda x: x['source'] == source)['context'])

# Barney
Evaluation of chatbot of _Barney Stinson_ from _How I Met Your Mother_

In [None]:
# Interactive cell: runs the three evaluation steps for Barney
eval_character('Barney', IN_COLAB=IN_COLAB)

# Sheldon
Evaluation of chatbot of _Sheldon Cooper_ from _The Big Bang Theory_

In [None]:
# Interactive cell: runs the three evaluation steps for Sheldon
eval_character('Sheldon', IN_COLAB=IN_COLAB)

# Harry
Evaluation of chatbot of _Harry Potter_ from _Harry Potter_ saga

In [None]:
# Interactive cell: runs the three evaluation steps for Harry
eval_character('Harry', IN_COLAB=IN_COLAB)

# Fry
Evaluation of chatbot of _Fry_ from _Futurama_

In [None]:
# Interactive cell: runs the three evaluation steps for Fry
eval_character('Fry', IN_COLAB=IN_COLAB)

# Bender
Evaluation of chatbot of _Bender_ from _Futurama_

In [11]:
# Interactive cell: runs the three evaluation steps for Bender
eval_character('Bender', IN_COLAB=IN_COLAB)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at /content/drive/My Drive/BarneyBot/Data/Characters/Bender/bender_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Step 1) Chat with Bender 
	Please evaluate your chat with this character:
>> User: a
chatbot: Aww, you look so sad.
>> User: a
chatbot: Oh, no!
>> User: a
chatbot: What?
>> User: a
chatbot: Bender, you're so cute!
>> User: a
chatbot: Hey, cut it out!
How do you rate this conversation (0 to 5)? 0
Step 2) Answers from Bender 
	Please evaluate how true these responses are for the character:
Question: Who are you?
chatbot: I'm an expert at being a bender. I bend girders, you see?
Question: What is your name?
chatbot: I'm Bender.
Question: What is your job?
chatbot: Sales assistant.
Question: Where do you live?
chatbot: New York.
How do you rate these answers (0 to 5)? 0
Step 3) Answers from Bender 
	Please evaluate the style of the responses.
	Do you think they are responses that Bender would say?


Loading cached processed dataset at /content/drive/My Drive/BarneyBot/cache/csv/default-9fb6c45fdd9437d1/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-a5eea12c5f280c21.arrow


Question: OK. First Bender, then Flexo, then Fry.
chatbot: I'm not giving my name to a machine.
Question: Just relax, Bender. Tomorrow we'll pry you down, have a nice breakfast and then go hunt down and slaughter that ancient evil.
chatbot: I'm not going near him.
Question: I'm too scared.
chatbot: I'll save me!
Question: Dr. Zoidberg? Are you OK?
chatbot: Uh, I'm fine.
Question: Fry, thank God we found you.
chatbot: I'm so glad to see you, Fry. I'm really starting to swell up with beer.
How do you rate these answers (0 to 5)? 0


# Vader
Evaluation of chatbot of _Darth Vader_ from _Star Wars_

In [None]:
# Interactive cell: runs the three evaluation steps for Vader
eval_character('Vader', IN_COLAB=IN_COLAB)

# Joey
Evaluation of chatbot of _Joey_ from _Friends_

In [None]:
# Interactive cell: runs the three evaluation steps for Joey
eval_character('Joey', IN_COLAB=IN_COLAB)

# Phoebe
Evaluation of chatbot of _Phoebe_ from _Friends_

In [None]:
# Interactive cell: runs the three evaluation steps for Phoebe
eval_character('Phoebe', IN_COLAB=IN_COLAB)