In [1]:
def calculate_distinct_n(text, n=2):
    """
    Calculate the Distinct-n metric for a given text.

    Distinct-n measures the lexical diversity of a text as the ratio of
    unique n-grams to the total number of n-grams. The text is lowercased
    and split on whitespace; punctuation stays attached to its word.

    As a side effect, the total and unique n-gram counts are printed to
    standard output.

    Parameters:
    - text (str): The text to be analyzed.
    - n (int): The length of the n-gram (e.g., 2 for Distinct-2). Must be >= 1.

    Returns:
    - float: The proportion of unique n-grams to the total number of n-grams,
      or 0.0 when the text contains fewer than n tokens.

    Raises:
    - ValueError: If n is less than 1.
    """
    # n = 0 would yield empty "n-grams" and a meaningless score, and a
    # negative n has no sensible interpretation, so reject both up front.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # Normalize the text to lowercase and split into whitespace-separated words
    tokens = text.lower().split()

    # Sliding window over the tokens; tuples are hashable, so they can be
    # de-duplicated directly without building joined strings. The range is
    # empty when there are fewer than n tokens.
    n_grams = [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

    total_n_grams = len(n_grams)
    print("Total n_grams: ", total_n_grams)
    unique_n_grams = len(set(n_grams))
    print("Unique n_grams: ", unique_n_grams)

    # Avoid dividing by zero and keep the documented float return type.
    if total_n_grams == 0:
        return 0.0
    return unique_n_grams / total_n_grams


In [2]:

# Sample texts to score. Both were produced from the same prompt:
#   "Generate a Story about love."
# sample1.txt holds the gpt3.5 output, sample2.txt the gpt4 output.
file_path1 = "/Users/Vas/Documents/Coding_Projects/BA_Experiment_Tests/Metrics/sample1.txt"
file_path2 = "/Users/Vas/Documents/Coding_Projects/BA_Experiment_Tests/Metrics/sample2.txt"

# Load each sample in full (text mode is open()'s default).
with open(file_path1, encoding="utf-8") as file:
    text1 = file.read()
with open(file_path2, encoding="utf-8") as file:
    text2 = file.read()

# Score bigrams first, then trigrams; within each pair text 1 is scored
# before text 2, so the diagnostic counts print in that order.
distinct_2_1, distinct_2_2 = (
    calculate_distinct_n(text1, 2),
    calculate_distinct_n(text2, 2),
)
distinct_3_1, distinct_3_2 = (
    calculate_distinct_n(text1, 3),
    calculate_distinct_n(text2, 3),
)

# Report the four scores, one per line.
print(
    f'Distinct-2, Text 1 score: {distinct_2_1}',
    f'Distinct-2, Text 2 score: {distinct_2_2}',
    f'Distinct-3, Text 1 score: {distinct_3_1}',
    f'Distinct-3, Text 2 score: {distinct_3_2}',
    sep="\n",
)


Total n_grams:  399
Unique n_grams:  365
Total n_grams:  569
Unique n_grams:  522
Total n_grams:  398
Unique n_grams:  387
Total n_grams:  568
Unique n_grams:  560
Distinct-2, Text 1 score: 0.9147869674185464
Distinct-2, Text 2 score: 0.9173989455184535
Distinct-3, Text 1 score: 0.9723618090452262
Distinct-3, Text 2 score: 0.9859154929577465
