In [1]:
# Function to safely extract a specific line if it exists
def safe_get_line(entry, line_index):
    lines = entry.split("\n")
    if len(lines) > line_index:
        return lines[line_index]
    else:
        return None  # or some default value or handle error

with open("data/hd/initial_combined/test_combined.txt", 'r') as file:
    stories = file.read().split("\n\n")

just_stories = [story.split("\n")[1] for story in stories[0:100]]


### **Linguistic Diversity Evaluation**
To assess the linguistic quality of the first 100 stories real stories from the test dataset of WritingsPrompts, we will evaluate them against a comprehensive set of linguistic metrics. 

In [33]:
from metrics.LexicalDiversity.lexical_diversity import *
from metrics.SemanticDiversity.sementic_diversity import *
from metrics.SyntacticDiversity.syntactic_diversity import *
from nltk.tokenize import sent_tokenize
import pandas as pd
import spacy

# Define the column names
columns = ["Distinct-2", "Distinct-3", "Self-BLEU", "OV-TTR", "MS-TTR", "S-DIV-AV", "S-DIV-C", "SYN-DIV"]

# Create an empty DataFrame with these columns
df_eval = pd.DataFrame(columns=columns)

# Load a spaCy model for dependency parsing
nlp = spacy.load("en_core_web_sm")

for story in just_stories:
    print(just_stories.index(story))

    # Tokenize the text into sentences
    sentences = sent_tokenize(story)
    graphs = construct_dependency_graphs(sentences)

    #A new row of data
    new_data = {
        "Distinct-2": calculate_distinct_n(story, 2),
        "Distinct-3": calculate_distinct_n(story, 3),
        "Self-BLEU": 1-calculate_self_bleu(sentences),
        "OV-TTR": calculate_ttr(story, truncate_length=300),
        "MS-TTR": calculate_mean_segmental_ttr(story, segment_size=50),
        "S-DIV-AV": calculate_semantic_diversity(sentences, 'average'),
        "S-DIV-C": calculate_semantic_diversity(sentences, 'centroid'),
        "SYN-DIV": calculate_syntactic_diversity(graphs)
    }

    # Convert new_data dictionary to a DataFrame
    new_row_df = pd.DataFrame([new_data])

    # Concatenate the new row DataFrame to the original DataFrame
    df_eval = pd.concat([df_eval, new_row_df], ignore_index=True)
        

# Calculate the mean for each column and append as a new row
averages = df_eval.mean().to_dict()
averages = {key: [value] for key, value in averages.items()}  # Convert each mean value into a list
average_df = pd.DataFrame(averages)  # Create a DataFrame for the averages
average_df.index = ['Average']  # Label the index as 'Average'

# Append the average row to the original DataFrame
df = pd.concat([df_eval, average_df])

# Specify the file path and name
file_path = './outputs/test_rd/lg_eval_table_rd.csv'

# Write the DataFrame to a CSV file
df.to_csv(file_path, index=False)  # Set index=False to not include row indices in the file

print(f"Data has been written to {file_path}")
# Print the last row (average values)
print("Average values for each metric:")
print(df.iloc[-1])

0


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  df_eval = pd.concat([df_eval, new_row_df], ignore_index=True)
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
Data has been written to ./outputs/test_rd/eval_table_rd.csv
Average values for each metric:
Distinct-2    0.857847
Distinct-3    0.958945
Self-BLEU     0.952630
OV-TTR        0.526975
MS-TTR        0.774601
S-DIV-AV      0.802330
S-DIV-C       0.527642
SYN-DIV       0.780589
Name: Average, dtype: float64
