# Evaluating Data
Using the prepared data, we run every text through the evaluator, store the results, and split them into a working set.

In [None]:
import itertools
import json
import math
import pathlib
import random
import shutil
import socket

from tqdm import tqdm

from scripts.ClientProtocol import ClientProtocol
from scripts.Localizer import Localizer
from scripts.MainProtocol import MainProtocol
from scripts.fast_detect_gpt_start import fast_detect_gpt_start

print("Done")

Constants.

In [None]:
# The number of sentences when using the sliding window
# (passed as `n_sentences` to `Localizer.iter_sentences` further down)
N_SENTENCES = 3

# The location of our data
# Every input and output directory in this notebook is resolved relative to it
DATA_DIR = pathlib.Path("./data")

# The IP and Port to set up the TCP connection
# This notebook listens on them and hands them to the evaluator as `--ip` / `--port`
IP = "localhost"
PORT = 8080

# The seed to use for RNG
RANDOM_SEED = 42

# If we should use the better detection model
# (adds `--reference_model_name gpt-j-6B` to the evaluator's arguments)
# This is not noted anywhere in the data, so please take note manually
# Also it makes the detector take much longer to start
USE_gpt_j_6B = False
# USE_gpt_j_6B = True

First, we verify that the prepared data exists.

In [None]:
prepared_data_dir = DATA_DIR / "prepared"

prepared_ai_modified_texts_dir = prepared_data_dir / "ai_modified_texts"
prepared_unmodified_texts_dir = prepared_data_dir / "unmodified_texts"

# Fail fast when an input directory is missing
# The AI modified texts are checked first, then the unmodified ones
for _required_dir in (prepared_ai_modified_texts_dir, prepared_unmodified_texts_dir):
    if not _required_dir.exists():
        raise FileNotFoundError(f"Directory not found: {_required_dir}")

print("Done")

Collecting the texts.

In [None]:
# Materialise both directory listings as lists of paths,
# so the evaluation loops below can iterate them (and tqdm can show a total)
prepared_ai_modified_texts_files = list(prepared_ai_modified_texts_dir.iterdir())
prepared_unmodified_texts_files = list(prepared_unmodified_texts_dir.iterdir())

print("Done")

Set up an evaluated directory.  
Where we store the evaluated texts.

In [None]:
evaluated_dir = DATA_DIR / "evaluated"

evaluated_ai_modified_texts_dir = evaluated_dir / "ai_modified_texts"
evaluated_unmodified_texts_dir = evaluated_dir / "unmodified_texts"

for _output_dir in (evaluated_ai_modified_texts_dir, evaluated_unmodified_texts_dir):
    # Clear the directory if it exists, so results of an earlier run never linger
    if _output_dir.exists():
        shutil.rmtree(_output_dir)

    # Recreate it
    # `parents=True` also creates `evaluated_dir` itself on a first run,
    # where a plain `mkdir()` would raise FileNotFoundError
    _output_dir.mkdir(parents=True)

print("Done")

Set up the evaluator

In [None]:
# Create a socket for the evaluator to connect to
my_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
    my_sock.bind((IP, PORT))
    my_sock.listen()

    # Create the kwargs
    _evaluator_kwargs = {
        "--ip": IP,
        "--port": str(PORT)
    }
    if USE_gpt_j_6B:
        _evaluator_kwargs["--reference_model_name"] = "gpt-j-6B"

    # Start the evaluator
    evaluator_process = fast_detect_gpt_start(**_evaluator_kwargs)

    # Accept the evaluator's connection (this blocks until the evaluator connects)
    evaluator_sock, _ = my_sock.accept()
except BaseException:
    # Release the port on any failure, including a kernel interrupt while waiting in `accept()`
    # Otherwise the socket stays bound and re-running this cell fails with "address already in use"
    my_sock.close()
    raise

print("Done")

A bunch of helper functions for evaluating the texts.

In [None]:
def evaluate_text(_sock: socket.socket, _text: str) -> tuple[float, float]:
    """Send one piece of text to the evaluator and return its evaluation.

    The text is wrapped in a `MainProtocol.TEXT` message and written to `_sock`.
    The reply must start with the `ClientProtocol.EVALUATION` indicator; the
    remainder of that message is then read and its `data` is returned.

    :param _sock: The connected socket to the evaluator.
    :param _text: The text to evaluate.
    :return: The `data` of the evaluation message. Annotated as a pair of floats;
        callers in this notebook treat index 1 as the probability.
    :raises ValueError: If the reply does not start with the evaluation indicator.
    """
    _request = MainProtocol.TEXT.create_message(_text)
    _sock.sendall(_request)

    # The evaluator must answer with an evaluation, anything else is a protocol error
    _indicator = ClientProtocol.read_indicator_from(_sock)
    if _indicator != ClientProtocol.EVALUATION:
        raise ValueError(f"Expected communication id {ClientProtocol.EVALUATION}, got {_indicator}")

    return ClientProtocol.EVALUATION.read_rest_of_message_from(_sock).data


Evaluating the unmodified texts.

First we load the data, then we localize it, then send it to the evaluator, and finally save it back to a file.

In [None]:
skips_because_of_nan = 0

# Okay, so starting with the unmodified texts
for file in tqdm(prepared_unmodified_texts_files, desc="Evaluating unmodified texts"):
    # Read the prepared record for this text
    with file.open() as f:
        data = json.load(f)

    # Maps every sentence span to the probabilities of all windows that covered it
    span_to_evaluation = {}

    # Walk over the text in windows of N_SENTENCES sentences
    localizer = Localizer(data["text"])
    for window_text, _, window_sub_spans in localizer.iter_sentences(n_sentences=N_SENTENCES):
        # Ask the evaluator about this window
        crit_and_prob = evaluate_text(evaluator_sock, window_text)

        # Credit the result to every sentence span inside the window
        # Only the probability (index 1) is kept right now
        for span in window_sub_spans:
            span_to_evaluation.setdefault(span, []).append(crit_and_prob[1])

    # Order the evaluations by span
    # The evaluations at index `i` correspond to the sentence at index `i`
    sentence_evaluations = [span_to_evaluation[span] for span in sorted(span_to_evaluation)]

    # Sometimes, for small inputs, the probability is nan
    # If any probability of this text is nan, the whole text is skipped
    if any(map(math.isnan, itertools.chain.from_iterable(sentence_evaluations))):
        skips_because_of_nan += 1
        continue

    # Keep only part of the prepared record and attach the evaluations
    evaluated_data = {key: data[key] for key in ("text", "sentences", "ai_modified")}
    evaluated_data["sentence_evaluations"] = sentence_evaluations

    # Finally we write a new file, with the same name, holding the results
    new_file = evaluated_unmodified_texts_dir / file.name
    with new_file.open("w") as f:
        f.write(json.dumps(evaluated_data))

print(f"Number of texts skipped because of NaN: {skips_because_of_nan}")

print("Done")

Repeat for the AI modified texts.

In [None]:
skips_because_of_nan = 0

for file in tqdm(prepared_ai_modified_texts_files, desc="Evaluating AI modified texts"):
    # Read the prepared record for this text
    with file.open() as f:
        data = json.load(f)

    # Maps every sentence span to the probabilities of all windows that covered it
    span_to_evaluation = {}

    # Walk over the text in windows of N_SENTENCES sentences
    localizer = Localizer(data["text"])
    for window_text, _, window_sub_spans in localizer.iter_sentences(n_sentences=N_SENTENCES):
        crit_and_prob = evaluate_text(evaluator_sock, window_text)

        # Only the probability (index 1) is kept for each span in the window
        for span in window_sub_spans:
            span_to_evaluation.setdefault(span, []).append(crit_and_prob[1])

    # One list of probabilities per span, ordered by span
    sentence_evaluations = [span_to_evaluation[span] for span in sorted(span_to_evaluation)]

    # A single nan probability disqualifies the whole text
    if any(map(math.isnan, itertools.chain.from_iterable(sentence_evaluations))):
        skips_because_of_nan += 1
        continue

    # Keep only part of the prepared record and attach the evaluations
    evaluated_data = {key: data[key] for key in ("text", "sentences", "ai_modified")}
    evaluated_data["sentence_evaluations"] = sentence_evaluations

    # Write the results to a file with the same name in the evaluated directory
    new_file = evaluated_ai_modified_texts_dir / file.name
    with new_file.open("w") as f:
        f.write(json.dumps(evaluated_data))

print(f"Number of texts skipped because of NaN: {skips_because_of_nan}")

print("Done")

Now we terminate the evaluator as we no longer need it.

In [None]:
# Tell the evaluator to shut down
# The sockets are closed even when the send fails (e.g. the evaluator already went away),
# so the port is not left bound in this kernel
try:
    evaluator_sock.sendall(
        MainProtocol.TERMINATE.create_message()
    )
finally:
    evaluator_sock.close()
    my_sock.close()

# Only reached when the terminate message was sent, so this wait is for an evaluator that was told to stop
evaluator_process.wait()

print("Done")

Now we create a working set of the texts.

This includes selecting 20% of the data for evaluation and the remaining 80% for fine-tuning.

Also, there will most likely be a discrepancy in the number of unmodified texts and AI modified texts.  
This will be due to the modification processing dropping any failed modifications.  
So I will simply select the same number of unmodified texts as we have modified texts.

In [None]:
random.seed(RANDOM_SEED)

working_set_dir = DATA_DIR / "working_set"

evaluation_set_dir = working_set_dir / "evaluation_set"
fine_tuning_set_dir = working_set_dir / "fine_tuning_set"

for _set_dir in (evaluation_set_dir, fine_tuning_set_dir):
    # Clear the directory if it exists
    if _set_dir.exists():
        shutil.rmtree(_set_dir)

    # Recreate it
    # `parents=True` also creates `working_set_dir` on a first run,
    # where a plain `mkdir()` would raise FileNotFoundError
    _set_dir.mkdir(parents=True)

# Grab our evaluated texts
# `iterdir()` yields entries in arbitrary order, so they are sorted first
# Otherwise the seeded sampling below would not pick the same files on every machine
evaluated_ai_modified_texts_files = sorted(evaluated_ai_modified_texts_dir.iterdir())
evaluated_unmodified_texts_files = sorted(evaluated_unmodified_texts_dir.iterdir())

# Trim the longer one to the shorter one, so both kinds are equally represented
# (`random.sample` raises ValueError when asked for more items than there are)
num_data = min(len(evaluated_ai_modified_texts_files), len(evaluated_unmodified_texts_files))
if len(evaluated_unmodified_texts_files) < len(evaluated_ai_modified_texts_files):
    print("Number of unmodified texts is less than the AI modified texts")

evaluated_ai_modified_texts_files = random.sample(evaluated_ai_modified_texts_files, num_data)
evaluated_unmodified_texts_files = random.sample(evaluated_unmodified_texts_files, num_data)

# Now we select 20% of each kind to be used as evaluation data
_20_percent = int(num_data * 0.2)
evaluation_data_files = random.sample(evaluated_ai_modified_texts_files, _20_percent)
evaluation_data_files += random.sample(evaluated_unmodified_texts_files, _20_percent)

# Everything that was not selected for evaluation is used for fine-tuning
# A set keeps the membership test cheap
_evaluation_data_lookup = set(evaluation_data_files)
fine_tuning_data_files = [file for file in evaluated_ai_modified_texts_files + evaluated_unmodified_texts_files
                          if file not in _evaluation_data_lookup]

# Finally we save these files to our new directories
# With a little renaming so we as the users can still read things
# The type also keeps equally named files of both kinds from overwriting each other
for _files, _target_dir in (
    (evaluation_data_files, evaluation_set_dir),
    (fine_tuning_data_files, fine_tuning_set_dir),
):
    for file in _files:
        # The kind of text is recognised by the directory the file comes from
        _type = "AI Modified" if "ai_modified_texts" in file.parts else "Unmodified"

        new_location = _target_dir / f"{file.stem} - {_type}.json"

        shutil.copy(file, new_location)

print("Done")