Created on Sat Dec  9 19:24:25 2023
@author: Michelle Fribance

The purpose of this script is to train Markov Chain models on the combined
fortunes dataset, using the markovify library: https://github.com/jsvine/markovify
then, for each combination of hyperparameters, generate a set of unique fortunes
and score them by perplexity.

markovify isn't available through Anaconda; must install using pip on your desired env:
pip install markovify

Key parameters of the markovify library:
state_size (default=2):
    - Determines the number of words that form the state of the Markov Chain.
    - Larger values generally lead to more coherent but less diverse text.

chain (default=None):
    - If you have a pre-built chain (possibly from a previous run), you can
      provide it to the model using this parameter.
    e.g.:
      chain = markovify.Chain.from_text(" ".join(proverbs), state_size=2)
      text_model = markovify.Text("", chain=chain)

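max_overlap_ratio (default=0.7):
    - This parameter controls the maximum allowed ratio of overlapping words
      between a generated sentence and the original training text. Generated
      sentences that overlap more than this are rejected.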

max_overlap_total (default=15):
    - This parameter controls the maximum allowed total number of overlapping
      words between a generated sentence and the original training text.

max_words (default=None):
    - Maximum number of words in the generated sentence (no limit when None).
      This is the keyword this script passes to make_sentence.

tries (default=10):
    - The number of attempts make_sentence makes to produce an acceptable
      sentence before giving up and returning None.

In [None]:
import os
import markovify
import random
import numpy as np
import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.lm import MLE
import matplotlib.pyplot as plt
import time

# Ensure the NLTK resources checked by this script are present locally.
# Each pair is (lookup path for nltk.data.find, package name for nltk.download).
for resource_path, package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        # Raises LookupError when the resource cannot be found
        nltk.data.find(resource_path)
    except LookupError:
        # Not found locally, so download it
        nltk.download(package_name)

In [None]:
# ------------------- Set variable values and model parameters ------------------ #
num_fortunes_to_generate = 100  # number of unique fortunes wanted per hyperparameter combination
tries = 100                     # forwarded to markovify's make_sentence as `tries`

# Set hyperparameter values (every combination of these lists is evaluated):
state_size_values = [2, 3]  # 4 is too slow. Can't generate enough unique fortunes so loops forever
max_words_values = [15, 18]
max_overlap_ratio_values = [0.5, 0.7, 0.9]
cosine_similarity_threshold_values = [0.5, 0.7]
cosine_sim_values = ["true", "false"]        # filter dissimilar generated fortunes?

# Set a seed value for reproducibility
seed_value = 42
random.seed(seed_value)  # Sets the seed for the Python random number generator

# Set the display.max_colwidth option to None to show full text in a column
pd.set_option('display.max_colwidth', None)

# Create an empty DataFrame to store the optimization results.
# The schema lists every key of the per-run result rows, including
# "Inf_Perplexity_Count", so the empty frame and the appended rows agree.
hyperparameter_results_df = pd.DataFrame(columns=[
    "State_Size",
    "Max_Words",
    "Max_Overlap_Ratio",
    "Cosine_Similarity_Threshold",
    "Cosine_Similarity",
    "Average_Perplexity",
    "Inf_Perplexity_Count",
])


In [None]:
############################ Function definitions #############################

def filter_fortunes_with_cosine_similarity(df_generated_fortunes, train_fortunes,
                                           cosine_similarity_threshold=0.7, nlp=None):
    """Remove generated fortunes whose similarity to the training set is too low.

    Every token of every training fortune contributes its word vector to one
    average "training vector". Each generated fortune is represented by the
    mean of its own token vectors and is kept only if the cosine similarity
    between that mean and the training vector reaches the threshold.

    Args:
        df_generated_fortunes: DataFrame with a "Generated fortunes" column.
            It is not modified; rows whose fortune is None/NaN are dropped
            from the result.
        train_fortunes: Iterable of training fortune strings.
        cosine_similarity_threshold: Minimum cosine similarity (inclusive) a
            generated fortune needs in order to be kept. Defaults to 0.7.
        nlp: Optional callable that turns a string into an iterable of tokens
            exposing a ``.vector`` attribute (e.g. an already loaded spaCy
            pipeline). When None, the pre-trained spaCy model
            "en_core_web_md" is loaded, downloading it first if necessary.

    Returns:
        A new DataFrame with the same columns and index labels as the input,
        containing only the rows that pass the threshold.

    Raises:
        ValueError: If the training fortunes contain no tokens at all.
    """
    # Load the model only when the caller did not supply one:
    if nlp is None:
        try:
            nlp = spacy.load("en_core_web_md")
        except OSError:
            print("Downloading 'en_core_web_md' model...")
            spacy.cli.download("en_core_web_md")
            nlp = spacy.load("en_core_web_md")

    # Collect the vector of every token of every training fortune:
    training_vectors = [token.vector for fortune in train_fortunes for token in nlp(fortune)]
    if not training_vectors:
        raise ValueError("train_fortunes must contain at least one token")

    # Average vector of all training tokens, and its norm (reused for every comparison):
    average_training_vector = np.mean(training_vectors, axis=0)
    training_norm = np.linalg.norm(average_training_vector)

    def similarity_to_training(text):
        """Return the cosine similarity between `text` and the training vector."""
        vectors = [token.vector for token in nlp(text)]
        if not vectors:
            # No tokens (e.g. empty string): nothing to compare, treat as dissimilar
            return 0.0
        vector = np.mean(vectors, axis=0)
        denominator = training_norm * np.linalg.norm(vector)
        if denominator == 0:
            # An all-zero vector has no direction; define its similarity as 0
            return 0.0
        return float(np.dot(average_training_vector, vector) / denominator)

    # Rows without a fortune can never pass, so drop them before scoring
    candidates = df_generated_fortunes.dropna(subset=["Generated fortunes"])
    similarities = candidates["Generated fortunes"].map(similarity_to_training).astype(float)

    # Keep only the fortunes at or above the threshold; copy so the caller's
    # DataFrame is never touched by later modifications of the result
    return candidates.loc[similarities >= cosine_similarity_threshold].copy()


def evaluate_fortune_perplexity(input_fortunes_df, training_fortunes_list, nlp=None):
    """Score each generated fortune by its perplexity under a bigram MLE model.

    An NLTK maximum-likelihood language model (https://www.nltk.org/api/nltk.lm.html)
    is fitted on the unigrams and bigrams of the training fortunes, tokenized
    with spaCy. Each fortune in the "Generated fortunes" column is tokenized
    the same way and scored with the model's perplexity.

    Args:
        input_fortunes_df: DataFrame with a "Generated fortunes" column. It is
            not modified; scoring happens on a copy.
        training_fortunes_list: Iterable of training fortune strings.
        nlp: Optional already-loaded spaCy pipeline. When None, the
            "en_core_web_md" model is loaded, downloading it first if necessary.

    Returns:
        A tuple ``(valid_perplexity_df, inf_perplexity_count)``: a copy of the
        input with an added "perplexity" column, restricted to rows whose
        perplexity is not infinite, and the number of rows that were infinite.
    """
    # Load the pre-trained spaCy model only when the caller did not supply one
    if nlp is None:
        try:
            nlp = spacy.load("en_core_web_md")
        except OSError:
            print("Downloading 'en_core_web_md' model...")
            spacy.cli.download("en_core_web_md")
            nlp = spacy.load("en_core_web_md")

    # Tokenize the training fortunes and flatten them into a single token
    # stream (no padding symbols are added between fortunes)
    train_data = [nlp(sentence) for sentence in training_fortunes_list]
    vocabulary = [token.text for tokens in train_data for token in tokens]

    # Create an NLTK model for the reference data:
    n = 2
    nltk_model = MLE(n)

    # Convert training data into n-grams of every length up to n
    train_ngrams = list(nltk.everygrams(vocabulary, max_len=n))

    # Fit the model
    nltk_model.fit([train_ngrams], vocabulary_text=vocabulary)

    def calculate_perplexity(sentence):
        """Return the perplexity of `sentence` under the fitted model."""
        if sentence is None:
            return float('inf')  # Return infinity for None values
        tokens = nlp(sentence)
        ngrams = list(nltk.everygrams(tuple([token.text for token in tokens]), max_len=n))
        if not ngrams:
            # A sentence without tokens cannot be scored; count it as invalid
            return float('inf')
        return nltk_model.perplexity(ngrams)

    # Work on a copy so the caller's DataFrame does not gain a "perplexity"
    # column (and so a filtered slice never triggers a chained-assignment warning)
    scored_df = input_fortunes_df.copy()
    scored_df["perplexity"] = scored_df["Generated fortunes"].apply(calculate_perplexity)

    print(scored_df[["Generated fortunes", "perplexity"]])

    # Separate the fortunes with "inf" perplexity from the valid ones:
    is_infinite = scored_df["perplexity"] == float('inf')
    inf_perplexity_count = int(is_infinite.sum())
    valid_perplexity_df = scored_df[~is_infinite]

    return valid_perplexity_df, inf_perplexity_count

######################## End of Function Definitions ##########################

In [None]:
# -------------------------------- Load data -------------------------------- #

# Path to the original combined_fortunes dataset
training_fortunes_path = os.path.join('..', 'datasets', 'combined_fortunes-4632.csv')

# Each line of the file is treated as one training fortune
# (readlines keeps the trailing newline of every line)
with open(training_fortunes_path, 'r') as file:
    fortunes = file.readlines()

train_fortunes = fortunes

# The Markov Chain models are trained on all fortunes joined into a single text
training_text = " ".join(train_fortunes)

# Upper bound on make_sentence calls per run, so a combination that cannot
# produce enough unique fortunes ends with fewer fortunes instead of looping forever
max_generation_attempts = 50 * num_fortunes_to_generate


def generate_unique_fortunes(text_model, count, max_words, max_overlap_ratio, tries, max_attempts):
    """Generate up to `count` distinct sentences from `text_model`.

    Calls ``text_model.make_sentence`` repeatedly, skipping failed attempts
    (None) and sentences already produced, and stops after `max_attempts`
    calls even if fewer than `count` sentences were collected.

    Returns:
        List of unique sentences in generation order (possibly shorter than `count`).
    """
    unique_fortunes = []
    seen = set()  # O(1) duplicate check
    attempts = 0
    while len(unique_fortunes) < count and attempts < max_attempts:
        attempts += 1
        fortune = text_model.make_sentence(max_words=max_words, max_overlap_ratio=max_overlap_ratio, tries=tries)
        if fortune is not None and fortune not in seen:
            seen.add(fortune)
            unique_fortunes.append(fortune)
    return unique_fortunes


# Define a start time to time how long the optimization takes
start_time = time.time()

number_of_runs = (len(state_size_values) *
                  len(max_words_values) *
                  len(max_overlap_ratio_values) *
                  len(cosine_similarity_threshold_values) *
                  len(cosine_sim_values))

print(f"Starting hyperparameter optimization using perplexity as evaluation metric on {number_of_runs} combinations...")

result_rows = []  # one dictionary per hyperparameter combination
run_number = 1    # only used for printing which run is in progress

# Loop through each combination of hyperparameters
for state_size in state_size_values:
    # Build the Markov Chain model once per state size: it depends on nothing else
    text_model = markovify.Text(training_text, state_size=state_size)

    for max_words in max_words_values:
        for max_overlap_ratio in max_overlap_ratio_values:
            for cosine_similarity_threshold in cosine_similarity_threshold_values:
                for cosine_sim in cosine_sim_values:
                    print(f"Run {run_number} of {number_of_runs}:")
                    print("\nHyperparameter values:")
                    print(f"State_Size: {state_size}, Max_Words: {max_words}, Max_Overlap_Ratio: {max_overlap_ratio}")
                    print(f"Cosine_Similarity_Threshold: {cosine_similarity_threshold}, Cosine_Similarity: {cosine_sim}\n")

                    # Generate a set of unique fortunes and save to a DataFrame
                    generated_fortunes = generate_unique_fortunes(
                        text_model,
                        count=num_fortunes_to_generate,
                        max_words=max_words,
                        max_overlap_ratio=max_overlap_ratio,
                        tries=tries,
                        max_attempts=max_generation_attempts,
                    )
                    if len(generated_fortunes) < num_fortunes_to_generate:
                        print(f"Warning: only {len(generated_fortunes)} unique fortunes generated "
                              f"after {max_generation_attempts} attempts.")
                    df_generated_fortunes = pd.DataFrame({"Generated fortunes": generated_fortunes})

                    # Filter out fortunes below the threshold
                    # NOTE(review): cosine_similarity_threshold from this loop is recorded in
                    # the results but is not forwarded to the filter, which applies its own
                    # built-in cutoff. Runs that differ only in threshold (and every run with
                    # cosine_sim == "false") therefore share the same settings. Forward the
                    # value here if the filter accepts a threshold argument.
                    if cosine_sim == "true":
                        filtered_fortunes = filter_fortunes_with_cosine_similarity(df_generated_fortunes, train_fortunes)
                    else:
                        filtered_fortunes = df_generated_fortunes

                    # Evaluate generated fortunes by calculating perplexity
                    valid_perplexity_df, inf_perplexity_count = evaluate_fortune_perplexity(input_fortunes_df=filtered_fortunes, training_fortunes_list=train_fortunes)

                    # Calculate average perplexity value for the entire valid_perplexity_df
                    average_perplexity = valid_perplexity_df["perplexity"].mean()

                    # Store the hyperparameters and resulting perplexity for this run
                    result_rows.append({
                        "State_Size": state_size,
                        "Max_Words": max_words,
                        "Max_Overlap_Ratio": max_overlap_ratio,
                        "Cosine_Similarity_Threshold": cosine_similarity_threshold,
                        "Cosine_Similarity": cosine_sim,
                        "Average_Perplexity": average_perplexity,
                        "Inf_Perplexity_Count": inf_perplexity_count
                    })

                    run_number += 1

# Build the results table in one step: numeric columns keep numeric dtypes, and
# re-running this cell does not pile new rows on top of those from an earlier run
hyperparameter_results_df = pd.DataFrame(result_rows)

# Calculate the script execution time
end_time = time.time()
execution_time = end_time - start_time
print(f"Script execution time: {execution_time} seconds")

In [None]:
# ------------------------- Export results to CSV --------------------------- #

# Check for or create "hyperparameter_optimization" folder, to store results.
# NOTE(review): Markov_Chains_folder_path is not defined in the visible part of
# this file; it must be set before this cell runs - confirm where it comes from.
hyperparameter_optimization_folder = os.path.join(Markov_Chains_folder_path, "hyperparameter_optimization")
# exist_ok=True creates the folder only when missing, without a separate
# existence check that could race with another process
os.makedirs(hyperparameter_optimization_folder, exist_ok=True)

# Export the generated DataFrame to a CSV file:
csv_file_path = os.path.join(hyperparameter_optimization_folder,
                             "hyperparameter_optimization_results-perplexity.csv")
hyperparameter_results_df.to_csv(csv_file_path, index=False)

print(f"\nHyperparameter optimization results exported to: {csv_file_path}")


In [None]:
# ------------------------- Visualize the results ---------------------------- #

# Create one subplot per value of cosine similarity. squeeze=False always
# returns a 2-D array of axes, so indexing works even with a single value.
fig, axs = plt.subplots(len(cosine_sim_values), 1, figsize=(10, 12), sharex=True, squeeze=False)
axs = axs[:, 0]

# Colors distinguish state sizes, markers distinguish max-word settings
colors = ['r', 'g', 'b']
markers = ['o', 's']

# Loop through each value of cosine similarity
for idx, cosine_sim in enumerate(cosine_sim_values):
    # Initialize variables to track lowest perplexity and its coordinates
    min_perplexity = float('inf')
    min_x, min_y = None, None

    # Loop through each hyperparameter
    for state_size, color in zip(state_size_values, colors):
        for max_words, marker in zip(max_words_values, markers):
            # Filter results for the current combination of hyperparameters
            filtered_results = hyperparameter_results_df[(hyperparameter_results_df['State_Size'] == state_size) &
                                                         (hyperparameter_results_df['Max_Words'] == max_words) &
                                                         (hyperparameter_results_df['Cosine_Similarity'] == cosine_sim)]

            # Plot the perplexity values
            axs[idx].plot(filtered_results['Max_Overlap_Ratio'], filtered_results['Average_Perplexity'],
                          marker=marker, linestyle='', color=color, label=f"State Size: {state_size}, Max Words: {max_words}")

            # Only rows with a usable (numeric, non-NaN) perplexity can be the minimum;
            # idxmin would fail on an empty or all-NaN selection
            perplexities = pd.to_numeric(filtered_results['Average_Perplexity'], errors='coerce').dropna()
            if perplexities.empty:
                continue

            # Find the lowest perplexity value and its coordinates
            min_idx = perplexities.idxmin()
            x_coord = filtered_results.loc[min_idx, 'Max_Overlap_Ratio']
            y_coord = perplexities.loc[min_idx]

            # Check if this is the lowest perplexity so far
            if y_coord < min_perplexity:
                min_perplexity = y_coord
                min_x, min_y = x_coord, y_coord

    # Set y label
    axs[idx].set_ylabel('Perplexity')

    # Add title
    axs[idx].set_title(f'Perplexity vs Hyperparameters (Cosine Similarity: {cosine_sim})')

    # Annotate the lowest perplexity marker (text placed slightly right of the point)
    if min_x is not None and min_y is not None:
        axs[idx].annotate(f'Lowest Perplexity: {min_perplexity:.2f}',
                          xy=(min_x, min_y), xytext=(min_x + 0.01, min_y))

# Set x label for the bottom plot
axs[-1].set_xlabel('Max Overlap Ratio')

# Set x-axis limit
axs[-1].set_xlim(0.4, 1.0)

# Add legend to the bottom plot
axs[-1].legend()

# Save the plot as an image file
image_file_path = os.path.join(hyperparameter_optimization_folder, "hyperparameter_results_plot-perplexity.png")
plt.savefig(image_file_path)

# Show the plot
plt.show()
