#### Geometry Score (Khrulkov and Oseledets, 2018)

This notebook requires access to the [original repository](https://github.com/KhrulkovV/geometry-score): clone it and place this notebook inside the ``geometry-score`` folder so that the ``gs`` module can be imported.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import gs
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# TODO: provide the path to the embeddings and the dataset
sentence_embeddings = np.load("../tda-human-ai-essays/dataset/sentence_embeddings.npy")
essay_dataset = pd.read_csv("../tda-human-ai-essays/dataset/essay_dataset.csv")

# Seed the RNG so that the RLT estimates (and every score derived from them)
# are repeatable under "Restart Kernel -> Run All".
# NOTE(review): assumes gs.rlts draws its random landmarks through NumPy's
# global RNG -- confirm against the gs module.
SEED = 0
np.random.seed(SEED)

# Row positions of the sentences written by each author. Positions (rather
# than index labels) are used because they index a plain NumPy array below;
# for the default index produced by read_csv the two coincide.
human_indices = np.flatnonzero((essay_dataset["author"] == "human").to_numpy()).tolist()
gpt_indices = np.flatnonzero((essay_dataset["author"] == "chatgpt").to_numpy()).tolist()

# get the human and gpt embeddings
human_embeddings = sentence_embeddings[human_indices]
gpt_embeddings = sentence_embeddings[gpt_indices]

# compute the RLTs on the whole sets
# NOTE(review): the n values are hard-coded; presumably they were chosen to
# match the number of human / GPT rows -- confirm, and confirm what `n` means
# in gs.rlts (number of RLT draws vs. subset size).
rlts_human = gs.rlts(human_embeddings, gamma=1.0 / 128, i_max=200, n=2582)
rlts_gpt = gs.rlts(gpt_embeddings, gamma=1.0 / 128, i_max=200, n=1842)

# compute the MRLTs (mean over the first axis of each RLT array)
mrlt_human = np.mean(rlts_human, axis=0)
mrlt_gpt = np.mean(rlts_gpt, axis=0)

In [None]:
# Plot the MRLTs of the human and GPT embeddings in one figure

# Set the font size
plt.rcParams.update({"font.size": 15})
palette = sns.diverging_palette(145, 300, n=2)
# The figure itself is created at the default dpi: a 15x10 inch canvas at
# 1300 dpi would be rasterised as 19500x13000 pixels for the inline display.
# The export resolution is still requested explicitly in savefig below.
plt.figure(figsize=(15, 10))
gs.fancy_plot(mrlt_human, label="MRLT: Human", color=palette[0])
gs.fancy_plot(mrlt_gpt, label="MRLT: GPT", color=palette[1])
plt.xlim([0, 175])
plt.xlabel(r"Number of $1$-dimensional holes $i$")
plt.legend()
plt.savefig("mrlt_human_gpt.pdf", bbox_inches="tight", dpi=1300)
plt.show()

In [None]:
# Geometry Score between the human and GPT RLT sets, scaled by 1e3 for display
score_human_vs_gpt = 1e3 * gs.geom_score(rlts_human, rlts_gpt)
print(f"Human vs. GPT: {score_human_vs_gpt}")

In [None]:
# Now, let's take independent samples of size 100 from the same distribution and compute the score

# Settings shared by all four draws below
sample_kwargs = dict(gamma=1.0 / 128, i_max=200, n=100)

# Two separate draws per author (call order kept: human, human, gpt, gpt)
rlts_human_1 = gs.rlts(human_embeddings, **sample_kwargs)
rlts_human_2 = gs.rlts(human_embeddings, **sample_kwargs)
rlts_gpt_1 = gs.rlts(gpt_embeddings, **sample_kwargs)
rlts_gpt_2 = gs.rlts(gpt_embeddings, **sample_kwargs)

# Mean RLT of each draw (used by the plotting cells further down)
mrlt_human_1 = rlts_human_1.mean(axis=0)
mrlt_human_2 = rlts_human_2.mean(axis=0)
mrlt_gpt_1 = rlts_gpt_1.mean(axis=0)
mrlt_gpt_2 = rlts_gpt_2.mean(axis=0)

# Within-author scores, scaled by 1e3 for display
score_within_human = 1e3 * gs.geom_score(rlts_human_1, rlts_human_2)
print(f"Score for samples within Human: {score_within_human}")
score_within_gpt = 1e3 * gs.geom_score(rlts_gpt_1, rlts_gpt_2)
print(f"Score for samples within GPT: {score_within_gpt}")

In [None]:
# Plot the MRLTs of the two human samples in one figure

# Set the font size
plt.rcParams.update({"font.size": 20})
# Set the palette for the last two plots
palette = sns.diverging_palette(145, 300, n=4)

# The figure itself is created at the default dpi: a 12x10 inch canvas at
# 1300 dpi would be rasterised as 15600x13000 pixels for the inline display.
# The export resolution is still requested explicitly in savefig below.
plt.figure(figsize=(12, 10))
gs.fancy_plot(mrlt_human_1, label="MRLT: Human, Sample 1", color=palette[0])
gs.fancy_plot(mrlt_human_2, label="MRLT: Human, Sample 2", color=palette[1])
plt.xlim([0, 175])
plt.xlabel(r"Number of $1$-dimensional holes $i$")
plt.legend()
plt.savefig("mrlt_humans.pdf", bbox_inches="tight", dpi=1300)
plt.show()

In [None]:
# Plot the MRLTs of the two GPT samples in one figure

# Set the font size
plt.rcParams.update({"font.size": 20})
# Build the 4-colour palette here as well, so this cell does not depend on
# which earlier cell last assigned `palette` (a 2-colour palette would make
# palette[2] / palette[3] fail).
palette = sns.diverging_palette(145, 300, n=4)
# The figure itself is created at the default dpi: a 12x10 inch canvas at
# 1300 dpi would be rasterised as 15600x13000 pixels for the inline display.
# The export resolution is still requested explicitly in savefig below.
plt.figure(figsize=(12, 10))
gs.fancy_plot(mrlt_gpt_1, label="MRLT: GPT, Sample 1", color=palette[2])
gs.fancy_plot(mrlt_gpt_2, label="MRLT: GPT, Sample 2", color=palette[3])
plt.xlim([0, 175])
plt.xlabel(r"Number of $1$-dimensional holes $i$")
plt.legend()
plt.savefig("mrlt_gpts.pdf", bbox_inches="tight", dpi=1300)
plt.show()