# Evaluate distinct-1 and distinct-2 for the e2e generations

We report the degree of diversity by calculating the number of distinct unigrams and bigrams in the generated responses. <br>
The value is scaled by the total number of generated tokens to avoid favoring long sentences (shown as distinct-1 and distinct-2 in Tables 2 and 3) [Li et al., 2016](https://aclanthology.org/N16-1014.pdf).

The implementation follows [Xu et al., 2022](https://aclanthology.org/2022.dialdoc-1.10.pdf) ([GitHub](https://github.com/HLTCHKUST/KnowExpert/blob/main/evaluation.py)).

In [1]:
import pandas as pd

In [2]:
import collections

def get_ngrams(text, n):
    """
    Collect every contiguous n-gram of a whitespace-tokenized string.

    The text is split with str.split() and used as-is: no lowercasing or other
    normalization happens here, so callers that want case-insensitive n-grams
    must lowercase the text before calling.

    Inputs:
      text: string whose tokens are separated by whitespace
      n: int, number of tokens per n-gram
    Returns:
      list of strings, one per n-gram in order of occurrence (duplicates kept),
      with the tokens of each n-gram joined by a single space. The list is
      empty when the text has fewer than n tokens.
    """
    words = text.split()
    # Number of start positions at which a full window of n tokens fits.
    window_count = len(words) - n + 1
    ngrams = []
    for start in range(window_count):
        ngrams.append(" ".join(words[start:start + n]))
    return ngrams

"""
Compute distinct-n metrics at the corpus level
"""
def _distinct_corpus_n(samples, n):
    """
    Corpus-level distinct-n:
    (number of unique n-grams) / (total number of n-grams, including duplicates),
    where both counts are pooled over every string in samples.

    Each sample is stripped and lowercased before its n-grams are extracted.
    Returns None if the samples contain no n-grams at all.
    """
    # Maps each n-gram to its number of occurrences across the whole corpus.
    pooled_counts = collections.Counter()
    for text in samples:
        pooled_counts.update(get_ngrams(text.strip().lower(), n))

    total_ngrams = sum(pooled_counts.values())
    if total_ngrams == 0:
        return None
    unique_ngrams = len(pooled_counts)
    return unique_ngrams / total_ngrams

def distinct_corpus_1(samples):
    """Return the corpus-level distinct-1 (unigram) score of samples."""
    return _distinct_corpus_n(samples, n=1)

def distinct_corpus_2(samples):
    """Return the corpus-level distinct-2 (bigram) score of samples."""
    return _distinct_corpus_n(samples, n=2)

def distinct_corpus_3(samples):
    """Return the corpus-level distinct-3 (trigram) score of samples."""
    return _distinct_corpus_n(samples, n=3)

def get_corpus_distinct(pred):
    """
    Return the corpus-level distinct-1, distinct-2 and distinct-3 scores of pred
    as a 3-tuple, in that order.
    """
    metrics = (distinct_corpus_1, distinct_corpus_2, distinct_corpus_3)
    return tuple(metric(pred) for metric in metrics)

In [3]:
import json

# Load the generations to score. The `with` block guarantees the file handle
# is closed even if reading or JSON parsing raises, and the encoding is pinned
# to UTF-8 so the result does not depend on the platform's default encoding.
# Later cells treat `data` as a mapping from a label to a collection of
# response strings.
with open("/home/willy/instructod/src/e2e/results/delex_preds.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [4]:
# Score every entry of `data` with corpus-level distinct-1 and distinct-2,
# keeping the entries in the order in which they appear in `data`.
keys = list(data)
d1s = [distinct_corpus_1(data[name]) for name in keys]
d2s = [distinct_corpus_2(data[name]) for name in keys]

# One row per entry of `data`; the bare expression below displays the table.
df_result = pd.DataFrame({'Type': keys, 'D-1': d1s, 'D-2': d2s})
df_result

Unnamed: 0,Type,D-1,D-2
0,gold,0.094607,0.35994
1,rg,0.056161,0.202344
2,e2e,0.07265,0.263295
3,e2e_multi,0.05349,0.187482
4,pptod,0.015454,0.056267
