In [1]:
from nl2pandas.backend.nli_for_pandas.embedding.BERT import BERT
from nl2pandas.backend.nli_for_pandas.similarity.cosine_similarity import CosineSimilarity
from nl2pandas.backend.nli_for_pandas.data.data import Data

import numpy as np
import itertools
import matplotlib.pyplot as plt

Calculate the embeddings for the different pretrained models from sentence-transformers.

In [3]:
# Names of the sentence-transformers checkpoints under comparison.
models = [
    "paraphrase-mpnet-base-v2",
    "paraphrase-TinyBERT-L6-v2",
    "paraphrase-distilroberta-base-v2",
    "paraphrase-MiniLM-L12-v2",
    "paraphrase-MiniLM-L6-v2",
    "paraphrase-albert-small-v2",
    "paraphrase-MiniLM-L3-v2",
    "nli-mpnet-base-v2",
    "stsb-mpnet-base-v2",
    "stsb-distilroberta-base-v2",
    "nli-roberta-base-v2",
    "stsb-roberta-base-v2",
    "nli-distilroberta-base-v2",
]

# Reference scores, indexed in parallel with `models` by the (commented-out)
# LaTeX table calls further down.
# NOTE(review): there are 15 values for 13 models; the two trailing scores
# presumably belong to the average_word_embeddings_* models that still show up
# in the saved output of the evaluation cell -- confirm before relying on them.
overall = [
    76.84, 75.36, 75.15, 74.81, 74.38,
    73.94, 73.55, 72.45, 72.12, 70.07,
    70.00, 69.89, 69.86, 61.57, 60.52,
]

# One embedding wrapper per checkpoint, in the same order as `models`.
bert_models = [BERT(checkpoint) for checkpoint in models]

data = Data(file="./evaluation/big_action_set.csv")
data_small = Data(file="./evaluation/small_action_set.csv")

# Totally unrelated utterances; each one is registered with itself as its
# action in both data sets.
unrelated = [
    "give me an apple",
    "please send your manager an e-mail",
    "I really like hot dogs",
    "bla bla blublibla",
    "test test test test",
]
for utterance in unrelated:
    for dataset in (data, data_small):
        dataset.utterances.append(utterance)
        dataset.actions.append(utterance)

In [3]:
# large action set
embeddings = []

for i, bert_model in enumerate(bert_models):
    print(models[i])
    %timeit bert_model.embed(data.utterances[0])
    embeddings.append(bert_model.embed(data.utterances))
    print()

paraphrase-mpnet-base-v2
56.5 ms ± 1.23 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

paraphrase-TinyBERT-L6-v2
31.2 ms ± 1.73 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

paraphrase-distilroberta-base-v2
40.9 ms ± 1.17 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

paraphrase-MiniLM-L12-v2
36.3 ms ± 578 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

paraphrase-MiniLM-L6-v2
19.4 ms ± 243 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

paraphrase-albert-small-v2
41.7 ms ± 1.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

paraphrase-MiniLM-L3-v2
9.52 ms ± 1.65 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

nli-mpnet-base-v2
56 ms ± 1.14 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

stsb-mpnet-base-v2
81.7 ms ± 1.59 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

stsb-distilroberta-base-v2
40.8 ms ± 1.48 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

nli-roberta-base-v2
85.8 ms ± 5.78

In [4]:
# create data pairs: every ordered index pair (i, j) over the utterances
# NOTE(review): this includes the self-pairs (i, i) and both orderings of each
# pair, so every utterance is also compared with itself -- confirm that is the
# intended evaluation set.
indices = range(len(data.utterances))
all_pairs = list(itertools.product(indices, repeat=2))
print("number of utterances:", len(data.utterances))
print("number of pairs:", len(all_pairs))

# 1 is for same, 0 is different action
true_values = np.array(
    [int(data.actions[first] == data.actions[second]) for first, second in all_pairs]
)
print("number of same:", int(np.count_nonzero(true_values == 1)))
print("number of different:", int(np.count_nonzero(true_values == 0)))

def calc_performance_score(embeddings, sim):
    """Split pairwise similarities into same-action and different-action groups.

    Reads the notebook-level ``all_pairs`` (index pairs) and ``true_values``
    (one flag per pair, 1 = same action, 0 = different action).

    Args:
        embeddings: indexable collection holding one embedding per utterance.
        sim: object exposing ``calculate(embedding_a, embedding_b)``.

    Returns:
        Tuple ``(same, different)`` of lists with the similarities of the
        pairs flagged 1 and of the pairs flagged 0, in pair order.
    """
    # similarity of every pair, in the order given by all_pairs
    scores = np.array(
        [sim.calculate(embeddings[left], embeddings[right]) for left, right in all_pairs]
    )

    # keep a score wherever the corresponding flag is truthy
    same = [score for score, flag in zip(scores, true_values) if flag]
    different = [score for score, flag in zip(scores, 1 - true_values) if flag]

    return (same, different)

number of utterances: 155
number of pairs: 24025
number of same: 455
number of different: 23570


In [5]:
def visualize(same, diff):
    """Show box plots of the two similarity distributions side by side.

    Args:
        same: similarities of utterance pairs that map to the same action.
        diff: similarities of utterance pairs that map to different actions.
    """
    # `positions` has to be passed by keyword: the second positional parameter
    # of boxplot is `notch`, so a positional [1, 2] would switch on notched
    # boxes instead of placing the two boxes at x = 1 and x = 2.
    plt.boxplot([same, diff], positions=[1, 2])
    # NOTE(review): the y-axis starts at 0, so negative similarities (if any)
    # are not visible -- confirm this is intended.
    plt.ylim(0, 1.1)
    plt.xticks([1, 2], ["same", "different"])
    plt.show()

In [16]:
def print_latex_table_entry(name, performance, overall, same, different):
    r"""Print one LaTeX table row followed by a horizontal rule.

    ``performance`` is printed as given; ``overall``, ``same`` and
    ``different`` are multiplied by 100 before printing. All four numbers are
    formatted with two decimals.

    NOTE(review): the commented-out call sites in this notebook pass the
    reference score ``overall[i]`` as ``performance`` and the computed score
    as ``overall`` -- the parameter names are swapped relative to that usage.

    Args:
        name: label for the first column (the model name at the call sites).
        performance: value printed unscaled in the second column.
        overall: fraction printed as a percentage in the third column.
        same: fraction printed as a percentage in the fourth column.
        different: fraction printed as a percentage in the fifth column.
    """
    print(f"{name} & {performance:2.2f} & {100*overall:2.2f} & {100*same:2.2f} & {100*different:2.2f} \\\\")
    # Escape the backslash explicitly: "\h" is not a valid escape sequence and
    # triggers a warning on current Python versions.
    print("\\hline")

The first number is the overall performance: the average of (a) the mean similarity of utterance pairs that map to the same action and (b) 1 minus the mean similarity of utterance pairs that map to different actions.

The first number in parentheses is the mean of the similarities for utterances that map to the same action (higher is better).

The second number in parentheses is the mean of the similarities for utterances that map to different actions (lower is better).

In [7]:
print("Performance for Cosine Similarity")
print()
cosine_sim = CosineSimilarity()

# one (same, different) pair of similarity lists per model
performances = []
for embedding in embeddings:
    performances.append(calc_performance_score(embedding, cosine_sim))

for i in range(len(performances)):
    same, diff = performances[i]
    p_same = np.mean(same)
    p_diff = np.mean(diff)
    # reward high similarity for equal actions and low similarity otherwise
    separation = 1 - p_diff
    performance = (p_same + separation) / 2
    print(f"{performance:.4f} ({p_same:.4f} | {p_diff:.4f})    - {models[i]}")
    # optional extra output per model:
    # visualize(same, diff)
    # print_latex_table_entry(models[i], overall[i], performance, p_same, p_diff)

Performance for Cosine Similarity



  return np.dot(vector1, vector2) / (


0.7346 (0.8093 | 0.3402)    - paraphrase-mpnet-base-v2
0.7596 (0.7658 | 0.2466)    - paraphrase-TinyBERT-L6-v2
0.7527 (0.7847 | 0.2794)    - paraphrase-distilroberta-base-v2
0.7447 (0.7837 | 0.2943)    - paraphrase-MiniLM-L12-v2
0.7454 (0.7694 | 0.2786)    - paraphrase-MiniLM-L6-v2
0.7445 (0.7519 | 0.2630)    - paraphrase-albert-small-v2
0.7462 (0.7818 | 0.2894)    - paraphrase-MiniLM-L3-v2
0.7075 (0.8039 | 0.3890)    - nli-mpnet-base-v2
0.7225 (0.8019 | 0.3568)    - stsb-mpnet-base-v2
0.7144 (0.7654 | 0.3365)    - stsb-distilroberta-base-v2
0.6840 (0.8284 | 0.4605)    - nli-roberta-base-v2
0.7197 (0.7670 | 0.3275)    - stsb-roberta-base-v2
0.6901 (0.8162 | 0.4359)    - nli-distilroberta-base-v2
nan (nan | nan)    - average_word_embeddings_komninos
nan (nan | nan)    - average_word_embeddings_glove.6B.300d


In [7]:
# small action set
embeddings = []

for i, bert_model in enumerate(bert_models):
    print(models[i])
    %timeit bert_model.embed(data_small.utterances[0])
    embeddings.append(bert_model.embed(data_small.utterances))
    print()

paraphrase-mpnet-base-v2
47.5 ms ± 1.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

paraphrase-TinyBERT-L6-v2
24.4 ms ± 336 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

paraphrase-distilroberta-base-v2
29.8 ms ± 2.32 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

paraphrase-MiniLM-L12-v2
20.5 ms ± 288 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

paraphrase-MiniLM-L6-v2
17.2 ms ± 187 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

paraphrase-albert-small-v2
38.9 ms ± 1.03 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

paraphrase-MiniLM-L3-v2
9.7 ms ± 253 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

nli-mpnet-base-v2
75.4 ms ± 809 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

stsb-mpnet-base-v2
76 ms ± 1.11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

stsb-distilroberta-base-v2
30.5 ms ± 2.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

nli-roberta-base-v2
60 ms ± 3.87 ms pe

In [8]:
# create data pairs for the small action set: every ordered index pair (i, j)
# NOTE(review): this includes the self-pairs (i, i) and both orderings of each
# pair, so every utterance is also compared with itself -- confirm that is the
# intended evaluation set.
indices = range(len(data_small.utterances))
all_pairs = list(itertools.product(indices, repeat=2))
print("number of utterances:", len(data_small.utterances))
print("number of pairs:", len(all_pairs))

# 1 is for same, 0 is different action
true_values = np.array(
    [
        int(data_small.actions[first] == data_small.actions[second])
        for first, second in all_pairs
    ]
)
print("number of same:", int(np.count_nonzero(true_values == 1)))
print("number of different:", int(np.count_nonzero(true_values == 0)))

# NOTE(review): this repeats the definition from the earlier "create data
# pairs" cell; it picks up the current `all_pairs` / `true_values` either way.
def calc_performance_score(embeddings, sim):
    """Split pairwise similarities into same-action and different-action groups.

    Reads the notebook-level ``all_pairs`` (index pairs) and ``true_values``
    (one flag per pair, 1 = same action, 0 = different action).

    Args:
        embeddings: indexable collection holding one embedding per utterance.
        sim: object exposing ``calculate(embedding_a, embedding_b)``.

    Returns:
        Tuple ``(same, different)`` of lists with the similarities of the
        pairs flagged 1 and of the pairs flagged 0, in pair order.
    """
    # similarity of every pair, in the order given by all_pairs
    scores = np.array(
        [sim.calculate(embeddings[left], embeddings[right]) for left, right in all_pairs]
    )

    # keep a score wherever the corresponding flag is truthy
    same = [score for score, flag in zip(scores, true_values) if flag]
    different = [score for score, flag in zip(scores, 1 - true_values) if flag]

    return (same, different)

number of utterances: 76
number of pairs: 5776
number of same: 216
number of different: 5560


In [17]:
print("Performance for Cosine Similarity")
print()
cosine_sim = CosineSimilarity()

# one (same, different) pair of similarity lists per model
performances = []
for embedding in embeddings:
    performances.append(calc_performance_score(embedding, cosine_sim))

for i in range(len(performances)):
    same, diff = performances[i]
    p_same = np.mean(same)
    p_diff = np.mean(diff)
    # reward high similarity for equal actions and low similarity otherwise
    separation = 1 - p_diff
    performance = (p_same + separation) / 2
    print(f"{performance:.4f} ({p_same:.4f} | {p_diff:.4f})    - {models[i]}")
    # optional extra output per model:
    # visualize(same, diff)
    # print_latex_table_entry(models[i], overall[i], performance, p_same, p_diff)

Performance for Cosine Similarity

0.7425 (0.8054 | 0.3204)    - paraphrase-mpnet-base-v2
0.7612 (0.7640 | 0.2415)    - paraphrase-TinyBERT-L6-v2
0.7655 (0.7836 | 0.2527)    - paraphrase-distilroberta-base-v2
0.7532 (0.7781 | 0.2718)    - paraphrase-MiniLM-L12-v2
0.7520 (0.7673 | 0.2632)    - paraphrase-MiniLM-L6-v2
0.7539 (0.7477 | 0.2399)    - paraphrase-albert-small-v2
0.7609 (0.7711 | 0.2494)    - paraphrase-MiniLM-L3-v2
0.7271 (0.7951 | 0.3410)    - nli-mpnet-base-v2
0.7423 (0.8030 | 0.3184)    - stsb-mpnet-base-v2
0.7321 (0.7646 | 0.3004)    - stsb-distilroberta-base-v2
0.7066 (0.8223 | 0.4090)    - nli-roberta-base-v2
0.7377 (0.7653 | 0.2899)    - stsb-roberta-base-v2
0.7130 (0.8124 | 0.3865)    - nli-distilroberta-base-v2
