In [48]:
import re


def calculate_distinct_n(text, n=2):
    """
    Calculate the Distinct-n metric for a given text. This metric evaluates the diversity of generated text 
    by counting the number of unique sequences of n words (n-grams).

    Parameters:
    - text (str): The text to be analyzed.
    - n (int): The length of the n-gram (e.g., 2 for Distinct-2).

    Returns:
    - float: The Distinct-n score as the proportion of unique n-grams to the total number of n-grams.
    """

    # Remove special characters and normalize to lowercase
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
    # Split the cleaned text into tokens
    tokens = cleaned_text.split()
    #print(tokens)
    # Generate n-grams from the list of tokens
    n_grams = [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    #print(len(n_grams))
    # Calculate the number of unique n-grams
    unique_n_grams = len(set(n_grams))
    #print(len(set(n_grams)))
    # Calculate the total number of n-grams
    total_n_grams = len(n_grams)
    
    # Calculate the Distinct-n score
    distinct_n_score =  unique_n_grams/total_n_grams if total_n_grams > 0 else 0
    print(distinct_n_score)
    return distinct_n_score
    #return total_n_grams

In [54]:
#from metrics.LexicalDiversity.lexical_diversity import *
#from metrics.SemanticDiversity.sementic_diversity import *
#from metrics.SyntacticDiversity.syntactic_diversity import *
#from nltk.tokenize import sent_tokenize
import pandas as pd
#import spacy

# Define the column names
#columns = ["Distinct-2", "Distinct-3", "Self-BLEU", "OV-TTR", "MS-TTR", "S-DIV-AV", "S-DIV-C", "SYN-DIV"]
columns = ["Distinct-2"]

# Create an empty DataFrame with these columns
df_eval = pd.DataFrame(columns=columns)

# Load a spaCy model for dependency parsing
#nlp = spacy.load("en_core_web_sm")

with open("./outputs/gen7/stories7.txt", 'r') as f:
#with open("data/hd/initial_combined/test_combined.txt") as f:
    stories = f.read().split("\n\n")
#stories = '\n\n'.join(stories)

for story in stories:
    #print(stories.index(story))

    # Tokenize the text into sentences
    #sentences = sent_tokenize(story)
    #graphs = construct_dependency_graphs(sentences)

    #A new row of data
    new_data = {
        "Distinct-2": calculate_distinct_n(story, 2),
        #"Distinct-3": calculate_distinct_n(story, 3),
        #"Self-BLEU": 1-calculate_self_bleu(sentences),
        #"OV-TTR": calculate_ttr(story, truncate_length=300),
        #"MS-TTR": calculate_mean_segmental_ttr(story, segment_size=50),
        #"S-DIV-AV": calculate_semantic_diversity(sentences, 'average'),
        #"S-DIV-C": calculate_semantic_diversity(sentences, 'centroid'),
        #"SYN-DIV": calculate_syntactic_diversity(graphs)
    }

    # Convert new_data dictionary to a DataFrame
    new_row_df = pd.DataFrame([new_data])

    # Concatenate the new row DataFrame to the original DataFrame
    df_eval = pd.concat([df_eval, new_row_df], ignore_index=True)
        

# Calculate the mean for each column and append as a new row
averages = df_eval.mean().to_dict()
averages = {key: [value] for key, value in averages.items()}  # Convert each mean value into a list
average_df = pd.DataFrame(averages)  # Create a DataFrame for the averages
average_df.index = ['Average']  # Label the index as 'Average'

# Append the average row to the original DataFrame
df = pd.concat([df_eval, average_df])

# Specify the file path and name
file_path = './outputs/gen5/eval_table_gen5.csv'

# Write the DataFrame to a CSV file
#df.to_csv(file_path, index=False)  # Set index=False to not include row indices in the file

print(f"Data has been written to {file_path}")
# Print the last row (average values)
print("Average values for each metric:")
print(df.iloc[-1])

1.0089020771513353
1.0025445292620865
1.0146520146520146
1.008888888888889
1.0099750623441397
1.1120218579234973
1.0093896713615023
1.0057971014492753
1.0094117647058825
1.0
1.002433090024331
1.0
1.0024813895781637
1.002932551319648
1.0054347826086956
1.0049261083743843
1.0
1.0094562647754137
1.0
1.011764705882353
1.0212201591511936
1.0106666666666666
1.0191846522781776
1.002659574468085
1.005012531328321
1.0184757505773672
1.0058997050147493
1.0026455026455026
1.005050505050505
1.0229591836734695
1.0
1.0056497175141244
1.0107913669064748
1.0470588235294118
1.0072463768115942
1.0024271844660195
1.0
1.0099502487562189
1.0024813895781637
1.0097560975609756
1.004739336492891
1.0115273775216138
1.00990099009901
1.0185676392572944
1.005730659025788
1.029810298102981
1.0033333333333334
1.0080645161290323
1.0154639175257731
1.00625
1.0053191489361701
1.0102827763496145
1.0
1.0416666666666667
1.0028328611898016
1.0
1.0085714285714287
1.0070921985815602
1.0
1.015075376884422
1.0595533498759304


  df_eval = pd.concat([df_eval, new_row_df], ignore_index=True)
