# **Visualisations**

In [None]:
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc
import json
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Pick the compute device: CUDA when PyTorch reports a usable GPU, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(DEVICE)

# Data

## Import Data

In [None]:
# Keys every results file must contain (and no others)
EXPECTED_KEYS = {"log_probs_base", "log_probs_transformed", "discrepancy_scores"}

def load_and_validate(file_path, n_samples=200, n_perturbations=100):
    """
    Load a results file and verify that it has the expected structure.

    The whole file is parsed with json.load, i.e. as ONE JSON document, so it
    must hold a single JSON object regardless of the file extension.

    Args:
        file_path (str): path of the results file to read
        n_samples (int): required length of each of the three top-level lists (default 200)
        n_perturbations (int): required length of every inner list of log_probs_transformed (default 100)

    Returns:
        data (dict): the parsed JSON object with exactly the keys in EXPECTED_KEYS

    Raises:
        ValueError: if the document is not a JSON object, if its keys differ from
            EXPECTED_KEYS, or if any list does not have the required length
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # A JSON array/scalar has no .keys(); report it as a validation error instead
    if not isinstance(data, dict):
        raise ValueError(f"Expected a JSON object, but got {type(data).__name__}")

    # Check that the JSON object has exactly the three expected keys
    if set(data.keys()) != EXPECTED_KEYS:
        raise ValueError(f"Expected keys {EXPECTED_KEYS}, but got {set(data.keys())}")

    # Validate lengths for each list
    if len(data["log_probs_base"]) != n_samples:
        raise ValueError(f"log_probs_base should contain {n_samples} floats")
    if len(data["log_probs_transformed"]) != n_samples:
        raise ValueError(f"log_probs_transformed should contain {n_samples} lists")
    if len(data["discrepancy_scores"]) != n_samples:
        raise ValueError(f"discrepancy_scores should contain {n_samples} floats")

    # Validate each inner list in log_probs_transformed has n_perturbations entries
    for idx, inner_list in enumerate(data["log_probs_transformed"]):
        if len(inner_list) != n_perturbations:
            raise ValueError(
                f"Inner list at index {idx} in log_probs_transformed does not contain {n_perturbations} floats"
            )
    return data

# -------------------------------------------------------------------
# Human-written texts: one validated results file per scoring model,
# in the order GPT-J, GPT-Neo, GPT-2 (see the "Scored_By" part of each path)
(
    human_computed_by_gpt_j_dct,
    human_computed_by_gpt_neo_dct,
    human_computed_by_gpt_2_dct,
) = [
    load_and_validate(results_path)
    for results_path in (
        "/workspace/Results/XSUM_Human__200_Samples__200_Max_Length__100_Perturbations__Generated_By_Human__Perturbed_By_google-t5/t5-3b__Scored_By_EleutherAI/gpt-j-6B__20250319_103854.jsonl",
        "/workspace/Results/XSUM_Human__200_Samples__200_Max_Length__100_Perturbations__Generated_By_Human__Perturbed_By_google-t5/t5-3b__Scored_By_EleutherAI/gpt-neo-2.7B__20250319_114542.jsonl",
        "/workspace/Results/XSUM_Human__200_Samples__200_Max_Length__100_Perturbations__Generated_By_Human__Perturbed_By_google-t5/t5-3b__Scored_By_openai-community/gpt2__20250319_120520.jsonl",
    )
]


# -------------------------------------------------------------------
# Texts generated by GPT-J: one validated results file per scoring model,
# in the order GPT-J, GPT-Neo, GPT-2 (see the "Scored_By" part of each path)
(
    ai_generated_by_gpt_j_computed_by_gpt_j_dct,
    ai_generated_by_gpt_j_computed_by_gpt_neo_dct,
    ai_generated_by_gpt_j_computed_by_gpt_2_dct,
) = [
    load_and_validate(results_path)
    for results_path in (
        "/workspace/Results/XSUM_AI__200_Samples__200_Max_Length__100_Perturbations__Generated_By_EleutherAI/gpt-j-6B__Perturbed_By_google-t5/t5-3b__Scored_By_EleutherAI/gpt-j-6B__20250318_175629.jsonl",
        "/workspace/Results/XSUM_AI__200_Samples__200_Max_Length__100_Perturbations__Generated_By_EleutherAI/gpt-j-6B__Perturbed_By_google-t5/t5-3b__Scored_By_EleutherAI/gpt-neo-2.7B__20250318_183320.jsonl",
        "/workspace/Results/XSUM_AI__200_Samples__200_Max_Length__100_Perturbations__Generated_By_EleutherAI/gpt-j-6B__Perturbed_By_google-t5/t5-3b__Scored_By_openai-community/gpt2__20250318_183732.jsonl",
    )
]


# -------------------------------------------------------------------
# Texts generated by GPT-Neo: one validated results file per scoring model,
# in the order GPT-J, GPT-Neo, GPT-2 (see the "Scored_By" part of each path)
(
    ai_generated_by_gpt_neo_computed_by_gpt_j_dct,
    ai_generated_by_gpt_neo_computed_by_gpt_neo_dct,
    ai_generated_by_gpt_neo_computed_by_gpt_2_dct,
) = [
    load_and_validate(results_path)
    for results_path in (
        "/workspace/Results/XSUM_AI__200_Samples__200_Max_Length__100_Perturbations__Generated_By_EleutherAI/gpt-neo-2.7B__Perturbed_By_google-t5/t5-3b__Scored_By_EleutherAI/gpt-j-6B__20250318_202111.jsonl",
        "/workspace/Results/XSUM_AI__200_Samples__200_Max_Length__100_Perturbations__Generated_By_EleutherAI/gpt-neo-2.7B__Perturbed_By_google-t5/t5-3b__Scored_By_EleutherAI/gpt-neo-2.7B__20250318_205825.jsonl",
        "/workspace/Results/XSUM_AI__200_Samples__200_Max_Length__100_Perturbations__Generated_By_EleutherAI/gpt-neo-2.7B__Perturbed_By_google-t5/t5-3b__Scored_By_openai-community/gpt2__20250318_210233.jsonl",
    )
]


# -------------------------------------------------------------------
# Texts generated by GPT-2: one validated results file per scoring model,
# in the order GPT-J, GPT-Neo, GPT-2 (see the "Scored_By" part of each path)
(
    ai_generated_by_gpt_2_computed_by_gpt_j_dct,
    ai_generated_by_gpt_2_computed_by_gpt_neo_dct,
    ai_generated_by_gpt_2_computed_by_gpt_2_dct,
) = [
    load_and_validate(results_path)
    for results_path in (
        "/workspace/Results/XSUM_AI__200_Samples__200_Max_Length__100_Perturbations__Generated_By_openai-community/gpt2__Perturbed_By_google-t5/t5-3b__Scored_By_EleutherAI/gpt-j-6B__20250319_112531.jsonl",
        "/workspace/Results/XSUM_AI__200_Samples__200_Max_Length__100_Perturbations__Generated_By_openai-community/gpt2__Perturbed_By_google-t5/t5-3b__Scored_By_EleutherAI/gpt-neo-2.7B__20250319_120321.jsonl",
        "/workspace/Results/XSUM_AI__200_Samples__200_Max_Length__100_Perturbations__Generated_By_openai-community/gpt2__Perturbed_By_google-t5/t5-3b__Scored_By_openai-community/gpt2__20250319_120717.jsonl",
    )
]

# Calculate DetectGPT Discrepancies

## DetectGPT Discrepancy Calculation Function

In [None]:
def compute_detectgpt_discrepancy(log_probs_per_text_base: list,
                                log_probs_per_text_transformed: list,
                                normalization: bool=False) -> list:
    """
    Compute one DetectGPT discrepancy value per text.

    For each text the discrepancy is the original log probability minus the
    mean log probability of its perturbed versions. With normalization the
    difference is additionally divided by the standard deviation of the
    perturbed log probabilities, unless that standard deviation is not
    positive, in which case the unnormalized difference is kept.

    Args:
        log_probs_per_text_base (list): original log probability of each text
        log_probs_per_text_transformed (list): one entry per text, each a list with the log probs of that text's perturbations
        normalization (bool): True to divide by the std of the perturbed log probs, False to return raw differences

    Returns:
        discrepancy_scores (list): discrepancy value (d) of each text, in input order
    """
    discrepancy_scores = []

    for text_idx, base_log_prob in enumerate(log_probs_per_text_base):
        perturbed = log_probs_per_text_transformed[text_idx]

        # Gap between the original text and its average perturbation
        gap = base_log_prob - np.mean(perturbed)

        if normalization:
            spread = np.std(perturbed)
            # Only rescale when the spread is strictly positive (avoids dividing by zero)
            if spread > 0:
                gap = gap / spread

        discrepancy_scores.append(gap)

    return discrepancy_scores

## Calculate Discrepancies

### 1) GPT-J as Scorer

Set Model

In [None]:
# Cache location for Hugging Face downloads and the checkpoint used as scorer
CACHE_DIR = "/tmp/huggingface"
COMPUTATION_MODEL_NAME = "EleutherAI/gpt-j-6B"

# Model list (all tested):
#   openai-community/gpt2
#   openai-community/gpt2-medium
#   openai-community/gpt2-large
#   openai-community/gpt2-xl
#   EleutherAI/gpt-neo-2.7B
#   EleutherAI/gpt-j-6B
#   EleutherAI/gpt-neox-20b

# Extra from_pretrained() arguments derived from the checkpoint name:
# GPT-J and NeoX get torch_dtype=torch.float16, GPT-J also asks for revision 'float16'
computation_model_kwargs = {}
if any(tag in COMPUTATION_MODEL_NAME for tag in ('gpt-j', 'neox')):
    computation_model_kwargs["torch_dtype"] = torch.float16
if 'gpt-j' in COMPUTATION_MODEL_NAME:
    computation_model_kwargs["revision"] = 'float16'

# NOTE(review): none of the visible cells reads computation_model or
# computation_tokenizer afterwards — confirm this load is still required.
computation_model = AutoModelForCausalLM.from_pretrained(
    COMPUTATION_MODEL_NAME,
    cache_dir=CACHE_DIR,
    **computation_model_kwargs,
)
computation_tokenizer = AutoTokenizer.from_pretrained(
    COMPUTATION_MODEL_NAME,
    cache_dir=CACHE_DIR,
)

computation_tokenizer.model_max_length = 1024

# Use EOS as padding token when the tokenizer defines none; the pad id is
# then pointed at the EOS id in every case
if computation_tokenizer.pad_token is None:
    computation_tokenizer.pad_token = computation_tokenizer.eos_token
computation_tokenizer.pad_token_id = computation_tokenizer.eos_token_id

computation_model.to(DEVICE)
print(DEVICE)

Calculate with Model

In [None]:
# Normalized DetectGPT discrepancies from the stored GPT-J log probs:
# human texts first, then the texts generated by GPT-J, GPT-Neo and GPT-2
(
    human_computed_by_gpt_j,
    ai_generated_by_gpt_j_computed_by_gpt_j,
    ai_generated_by_gpt_neo_computed_by_gpt_j,
    ai_generated_by_gpt_2_computed_by_gpt_j,
) = [
    compute_detectgpt_discrepancy(
        results["log_probs_base"],
        results["log_probs_transformed"],
        normalization=True,
    )
    for results in (
        human_computed_by_gpt_j_dct,
        ai_generated_by_gpt_j_computed_by_gpt_j_dct,
        ai_generated_by_gpt_neo_computed_by_gpt_j_dct,
        ai_generated_by_gpt_2_computed_by_gpt_j_dct,
    )
]

### 2) GPT-Neo as Scorer

Set Model

In [None]:
# Cache location for Hugging Face downloads and the checkpoint used as scorer
CACHE_DIR = "/tmp/huggingface"
COMPUTATION_MODEL_NAME = "EleutherAI/gpt-neo-2.7B"

# Model list (all tested):
#   openai-community/gpt2
#   openai-community/gpt2-medium
#   openai-community/gpt2-large
#   openai-community/gpt2-xl
#   EleutherAI/gpt-neo-2.7B
#   EleutherAI/gpt-j-6B
#   EleutherAI/gpt-neox-20b

# Extra from_pretrained() arguments derived from the checkpoint name:
# GPT-J and NeoX get torch_dtype=torch.float16, GPT-J also asks for revision 'float16'
# (neither substring matches the GPT-Neo name set above, so the dict stays empty here)
computation_model_kwargs = {}
if any(tag in COMPUTATION_MODEL_NAME for tag in ('gpt-j', 'neox')):
    computation_model_kwargs["torch_dtype"] = torch.float16
if 'gpt-j' in COMPUTATION_MODEL_NAME:
    computation_model_kwargs["revision"] = 'float16'

# NOTE(review): none of the visible cells reads computation_model or
# computation_tokenizer afterwards — confirm this load is still required.
computation_model = AutoModelForCausalLM.from_pretrained(
    COMPUTATION_MODEL_NAME,
    cache_dir=CACHE_DIR,
    **computation_model_kwargs,
)
computation_tokenizer = AutoTokenizer.from_pretrained(
    COMPUTATION_MODEL_NAME,
    cache_dir=CACHE_DIR,
)

computation_tokenizer.model_max_length = 1024

# Use EOS as padding token when the tokenizer defines none; the pad id is
# then pointed at the EOS id in every case
if computation_tokenizer.pad_token is None:
    computation_tokenizer.pad_token = computation_tokenizer.eos_token
computation_tokenizer.pad_token_id = computation_tokenizer.eos_token_id

computation_model.to(DEVICE)
print(DEVICE)

Calculate with Model

In [None]:
# Normalized DetectGPT discrepancies from the stored GPT-Neo log probs:
# human texts first, then the texts generated by GPT-J, GPT-Neo and GPT-2
(
    human_computed_by_gpt_neo,
    ai_generated_by_gpt_j_computed_by_gpt_neo,
    ai_generated_by_gpt_neo_computed_by_gpt_neo,
    ai_generated_by_gpt_2_computed_by_gpt_neo,
) = [
    compute_detectgpt_discrepancy(
        results["log_probs_base"],
        results["log_probs_transformed"],
        normalization=True,
    )
    for results in (
        human_computed_by_gpt_neo_dct,
        ai_generated_by_gpt_j_computed_by_gpt_neo_dct,
        ai_generated_by_gpt_neo_computed_by_gpt_neo_dct,
        ai_generated_by_gpt_2_computed_by_gpt_neo_dct,
    )
]

### 3) GPT-2 as Scorer

Set Model

In [None]:
# Cache location for Hugging Face downloads and the checkpoint used as scorer
CACHE_DIR = "/tmp/huggingface"
COMPUTATION_MODEL_NAME = "openai-community/gpt2"

# Model list (all tested):
#   openai-community/gpt2
#   openai-community/gpt2-medium
#   openai-community/gpt2-large
#   openai-community/gpt2-xl
#   EleutherAI/gpt-neo-2.7B
#   EleutherAI/gpt-j-6B
#   EleutherAI/gpt-neox-20b

# Extra from_pretrained() arguments derived from the checkpoint name:
# GPT-J and NeoX get torch_dtype=torch.float16, GPT-J also asks for revision 'float16'
# (neither substring matches the GPT-2 name set above, so the dict stays empty here)
computation_model_kwargs = {}
if any(tag in COMPUTATION_MODEL_NAME for tag in ('gpt-j', 'neox')):
    computation_model_kwargs["torch_dtype"] = torch.float16
if 'gpt-j' in COMPUTATION_MODEL_NAME:
    computation_model_kwargs["revision"] = 'float16'

# NOTE(review): none of the visible cells reads computation_model or
# computation_tokenizer afterwards — confirm this load is still required.
computation_model = AutoModelForCausalLM.from_pretrained(
    COMPUTATION_MODEL_NAME,
    cache_dir=CACHE_DIR,
    **computation_model_kwargs,
)
computation_tokenizer = AutoTokenizer.from_pretrained(
    COMPUTATION_MODEL_NAME,
    cache_dir=CACHE_DIR,
)

computation_tokenizer.model_max_length = 1024

# Use EOS as padding token when the tokenizer defines none; the pad id is
# then pointed at the EOS id in every case
if computation_tokenizer.pad_token is None:
    computation_tokenizer.pad_token = computation_tokenizer.eos_token
computation_tokenizer.pad_token_id = computation_tokenizer.eos_token_id

computation_model.to(DEVICE)
print(DEVICE)

Calculate with Model

In [None]:
# Normalized DetectGPT discrepancies from the stored GPT-2 log probs:
# human texts first, then the texts generated by GPT-J, GPT-Neo and GPT-2
(
    human_computed_by_gpt_2,
    ai_generated_by_gpt_j_computed_by_gpt_2,
    ai_generated_by_gpt_neo_computed_by_gpt_2,
    ai_generated_by_gpt_2_computed_by_gpt_2,
) = [
    compute_detectgpt_discrepancy(
        results["log_probs_base"],
        results["log_probs_transformed"],
        normalization=True,
    )
    for results in (
        human_computed_by_gpt_2_dct,
        ai_generated_by_gpt_j_computed_by_gpt_2_dct,
        ai_generated_by_gpt_neo_computed_by_gpt_2_dct,
        ai_generated_by_gpt_2_computed_by_gpt_2_dct,
    )
]

## Organise Discrepancies

In [None]:
# Flat list of 18 score lists: nine consecutive (human, AI) pairs laid out
# row by row for a 3x3 grid. Rows follow the generating model and columns the
# scoring model, both in the order GPT-J, GPT-Neo, GPT-2.
scores_list = [
    scores
    for human_ai_pair in (
        (human_computed_by_gpt_j, ai_generated_by_gpt_j_computed_by_gpt_j),        # Top-left
        (human_computed_by_gpt_neo, ai_generated_by_gpt_j_computed_by_gpt_neo),    # Top-middle
        (human_computed_by_gpt_2, ai_generated_by_gpt_j_computed_by_gpt_2),        # Top-right
        (human_computed_by_gpt_j, ai_generated_by_gpt_neo_computed_by_gpt_j),      # Middle-left
        (human_computed_by_gpt_neo, ai_generated_by_gpt_neo_computed_by_gpt_neo),  # Middle
        (human_computed_by_gpt_2, ai_generated_by_gpt_neo_computed_by_gpt_2),      # Middle-right
        (human_computed_by_gpt_j, ai_generated_by_gpt_2_computed_by_gpt_j),        # Bottom-left
        (human_computed_by_gpt_neo, ai_generated_by_gpt_2_computed_by_gpt_neo),    # Bottom-middle
        (human_computed_by_gpt_2, ai_generated_by_gpt_2_computed_by_gpt_2),        # Bottom-right
    )
    for scores in human_ai_pair
]

# Functions

## AUROC Calculation

In [None]:
def get_roc_metrics(discrepancy_scores_human, discrepancy_scores_ai):
    """
    Compute the ROC curve and its area for human-vs-AI discrepancy scores.

    Human scores are labelled 0 and AI scores 1; the discrepancy values
    themselves are used as the classifier scores.

    Args:
        discrepancy_scores_human: discrepancy scores of the human-written texts
        discrepancy_scores_ai: discrepancy scores of the AI-generated texts

    Returns:
        tuple: (fpr as list, tpr as list, area under the ROC curve as float)
    """
    labels = np.concatenate([
        np.zeros(len(discrepancy_scores_human), dtype=int),
        np.ones(len(discrepancy_scores_ai), dtype=int),
    ])
    scores = np.concatenate([discrepancy_scores_human, discrepancy_scores_ai])

    fpr, tpr, _ = roc_curve(labels, scores)
    return fpr.tolist(), tpr.tolist(), float(auc(fpr, tpr))

## Histogram Grid

In [None]:
def plot_histogram_grid():
    """
    Show a 3x3 grid of overlaid human/AI discrepancy-score histograms.

    Panel k (row-major) uses scores_list[2*k] as the human scores and
    scores_list[2*k + 1] as the AI scores; its title is the AUROC returned by
    get_roc_metrics for that pair. Rows are labelled with the generating
    model and columns with the scoring model.
    """
    generator_names = ["GPT-J", "GPT-Neo", "GPT-2"]  # row labels
    scorer_names = ["GPT-J", "GPT-Neo", "GPT-2"]     # column labels

    fig, axes = plt.subplots(3, 3, figsize=(12, 12))
    # y > 1 pushes the title above the column headers
    fig.suptitle('Histograms of Discrepancy Scores', fontsize=24, fontweight='bold', y=1.04)

    for panel_idx, ax in enumerate(axes.flat):
        row, col = divmod(panel_idx, 3)
        human_scores = scores_list[2 * panel_idx]
        ai_scores = scores_list[2 * panel_idx + 1]

        _, _, roc_auc = get_roc_metrics(human_scores, ai_scores)

        for scores, label in ((human_scores, 'Human'), (ai_scores, 'AI')):
            ax.hist(scores, bins='auto', alpha=0.5, label=label, edgecolor='black')

        ax.set_title(f'AUROC {roc_auc:.2f}')

        # Left column: y-axis label plus the row (generating model) name
        if col == 0:
            ax.set_ylabel('Frequency')
            ax.text(-0.25, 0.5, generator_names[row], transform=ax.transAxes,
                    fontsize=16, fontweight='bold', rotation=90,
                    verticalalignment='center', horizontalalignment='center')

        # Top row: the column (scoring model) name above the panel title
        if row == 0:
            ax.text(0.5, 1.15, scorer_names[col], transform=ax.transAxes,
                    fontsize=16, fontweight='bold', rotation='horizontal',
                    verticalalignment='bottom', horizontalalignment='center')

        # Bottom row: x-axis label
        if row >= 2:
            ax.set_xlabel('Discrepancy Score')

        ax.legend()

    # Leave the top 4% of the figure free for the 'Scoring Model' header
    plt.tight_layout(rect=[0, 0, 1, 0.96])

    fig.text(0.5, 0.95, 'Scoring Model', ha='center', fontsize=20, fontweight='bold')
    fig.text(-0.06, 0.5, 'Generating Model', va='center', rotation='vertical',
             fontsize=20, fontweight='bold')

    plt.show()

## AUROC Grid

In [None]:
def plot_auroc_grid():
    """
    Show the nine AUROC values as an annotated 3x3 heatmap.

    Cell k (row-major) is the AUROC from get_roc_metrics for the pair
    scores_list[2*k] (human) and scores_list[2*k + 1] (AI). Rows are labelled
    with the generating model and columns with the scoring model.
    """
    generator_names = ["GPT-J", "GPT-Neo", "GPT-2"]  # row labels
    scorer_names = ["GPT-J", "GPT-Neo", "GPT-2"]     # column labels

    auroc_values = []
    for cell in range(9):
        _, _, cell_auroc = get_roc_metrics(scores_list[2 * cell], scores_list[2 * cell + 1])
        auroc_values.append(cell_auroc)
    auroc_matrix = np.array(auroc_values).reshape(3, 3)

    fig, ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(
        auroc_matrix,
        annot=True,
        fmt='.2f',
        cmap='Blues',
        linewidths=0.5,
        ax=ax,
        cbar_kws={'label': 'AUROC'},
    )

    # Column labels, moved to the top edge of the heatmap
    ax.set_xticklabels(scorer_names, fontsize=12, fontweight='bold', ha='center', rotation=0)
    ax.xaxis.set_label_position('top')
    ax.xaxis.tick_top()

    # Row labels
    ax.set_yticklabels(generator_names, fontsize=12, fontweight='bold', va='center', rotation=90)

    # Main title, then the two axis captions
    fig.suptitle('Grid of AUROCs', fontsize=16, fontweight='bold', y=1.08)
    fig.text(0.5, 0.96, 'Scoring Model', ha='center', fontsize=14, fontweight='bold')
    fig.text(0, 0.5, 'Generating Model', va='center', rotation='vertical',
             fontsize=14, fontweight='bold')

    plt.show()

# Execute Code

In [None]:
# Render both figures from scores_list: the histogram grid first, then the AUROC heatmap
plot_histogram_grid()
plot_auroc_grid()

# **Ensemble Methods**

## Basic Statistics

### Mean

Simple Mean

In [None]:
def _assert_pairwise_distinct(*score_lists):
    """Assert that no two of the given score lists are element-wise (almost) equal."""
    from itertools import combinations

    # np.allclose compares exactly two arrays (its 3rd positional argument is
    # rtol), so every pair has to be checked separately
    for (first_idx, first), (second_idx, second) in combinations(enumerate(score_lists), 2):
        assert not np.allclose(first, second), (
            f"score lists {first_idx} and {second_idx} are (almost) identical"
        )


# Calculate the arithmetic mean of the scores for the human text.
# The even entries of scores_list repeat the three human score lists once per
# grid row; the repetition does not change the overall mean.
# NOTE(review): np.mean without an axis collapses all scorers and texts into a
# single number — confirm that a per-text ensemble (axis=0) is not intended.
human_scores = np.array(scores_list[0::2])
human_scores_mean = np.mean(human_scores)

# Check that no two lists of scores for the GPT-J generated texts are the same (note that each entry is a list)
_assert_pairwise_distinct(scores_list[1], scores_list[3], scores_list[5])
# Calculate the arithmetic mean of the scores for the GPT-J generated text (entries 1, 3, 5)
gpt_j_scores = np.array([scores_list[1], scores_list[3], scores_list[5]])
gpt_j_scores_mean = np.mean(gpt_j_scores)

# Check that no two lists of scores for the GPT-Neo generated texts are the same (note that each entry is a list)
_assert_pairwise_distinct(scores_list[7], scores_list[9], scores_list[11])
# Calculate the arithmetic mean of the scores for the GPT-Neo generated text (entries 7, 9, 11)
gpt_neo_scores = np.array([scores_list[7], scores_list[9], scores_list[11]])
gpt_neo_scores_mean = np.mean(gpt_neo_scores)

# Check that no two lists of scores for the GPT-2 generated texts are the same (note that each entry is a list)
_assert_pairwise_distinct(scores_list[13], scores_list[15], scores_list[17])
# Calculate the arithmetic mean of the scores for the GPT-2 generated text (entries 13, 15, 17)
gpt_2_scores = np.array([scores_list[13], scores_list[15], scores_list[17]])
gpt_2_scores_mean = np.mean(gpt_2_scores)

Mean Excluding Base Model

In [None]:
# Means over the scorers that differ from the generating model: the entry
# where generator and scorer coincide (1, 9 and 17 in scores_list) is left out.

# Human text has no generating model, so nothing is excluded
human_scores_no_base = human_scores
human_scores_no_base_mean = human_scores_no_base.mean()

# GPT-J generated text scored by GPT-Neo and GPT-2
gpt_j_scores_no_base = np.array([scores_list[idx] for idx in (3, 5)])
gpt_j_scores_no_base_mean = gpt_j_scores_no_base.mean()

# GPT-Neo generated text scored by GPT-J and GPT-2
gpt_neo_scores_no_base = np.array([scores_list[idx] for idx in (7, 11)])
gpt_neo_scores_no_base_mean = gpt_neo_scores_no_base.mean()

# GPT-2 generated text scored by GPT-J and GPT-Neo
gpt_2_scores_no_base = np.array([scores_list[idx] for idx in (13, 15)])
gpt_2_scores_no_base_mean = gpt_2_scores_no_base.mean()

### Median

Simple Median

In [None]:
# Median over all scores of each group (human, GPT-J, GPT-Neo, GPT-2 generated),
# computed on the same arrays as the simple means
(
    human_scores_median,
    gpt_j_scores_median,
    gpt_neo_scores_median,
    gpt_2_scores_median,
) = [
    np.median(group_scores)
    for group_scores in (human_scores, gpt_j_scores, gpt_neo_scores, gpt_2_scores)
]

Median Excluding Base Model

In [None]:
# These calculate the median, ignoring the results where the generating model is used as a scoring model.
# The results go into *_no_base_median names so that the *_no_base_mean values
# computed for the mean are not overwritten.

human_scores_no_base_median = np.median(human_scores_no_base)

gpt_j_scores_no_base_median = np.median(gpt_j_scores_no_base)

gpt_neo_scores_no_base_median = np.median(gpt_neo_scores_no_base)

gpt_2_scores_no_base_median = np.median(gpt_2_scores_no_base)

### Maximum

Simple Maximum

In [None]:
# Largest score of each group (human, GPT-J, GPT-Neo, GPT-2 generated),
# taken over the same arrays as the simple means
human_scores_max = human_scores.max()
gpt_j_scores_max = gpt_j_scores.max()
gpt_neo_scores_max = gpt_neo_scores.max()
gpt_2_scores_max = gpt_2_scores.max()

Maximum Excluding Base Model

In [None]:
# These calculate the maximum, ignoring the results where the generating model is used as a scoring model.
# The results go into *_no_base_max names so that the *_no_base_mean values
# computed for the mean are not overwritten.

human_scores_no_base_max = np.max(human_scores_no_base)

gpt_j_scores_no_base_max = np.max(gpt_j_scores_no_base)

gpt_neo_scores_no_base_max = np.max(gpt_neo_scores_no_base)

gpt_2_scores_no_base_max = np.max(gpt_2_scores_no_base)