In [None]:
import os
import warnings
from collections import defaultdict

import pandas as pd
import torch
from dotenv import load_dotenv
from lightning.pytorch import Trainer, seed_everything
from lightning.pytorch import loggers as pl_loggers
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from tqdm.auto import tqdm, trange

load_dotenv("../env")

warnings.filterwarnings("ignore", ".*does not have many workers.*")

In [2]:
from luminar.document.data import (
    FeatureDataset,
    PaddingDataloader,
    n_way_split,
)
from luminar.document.model import ConvolutionalLayerSpec, DocumentClassficationModel
from luminar.features import FeatureExtractor, OneDimFeatures, Slicer, TwoDimFeatures
from luminar.mongo import MongoFindDataset

In [3]:
config = {
    "seed": 42,
    "eval_split": 0.1,
    "test_split": 0.2,
    # "feature_model": "meta-llama/Llama-3.2-1B",
    "feature_model": "gpt2",
}

In [4]:
agents = [
    "human",
    "claude",
    "gpt",
    "gpt_prompt1",
    "gpt_prompt2",
    "gpt_semantic",
    "gpt_writing",
]

In [45]:
# feature_dim = OneDimFeatures(128)
# featurizer = FeatureExtractor.Likelihood()
# config["second_dim_as_channels"] = False
feature_dim = TwoDimFeatures(256, 13)
featurizer = FeatureExtractor.IntermediateLogits(13)
config["second_dim_as_channels"] = True

# slicer = Slicer.First(feature_dim[0])
# slicer = Slicer.Random(feature_dim[0])
slicer = Slicer.RandomMultiple(feature_dim[0] // 4, multiple=4, stride=16)
# slicer = Slicer.RandomMultiple(feature_dim[0] // 4, 4)

config["feature_dim"] = feature_dim
config["featurizer"] = repr(featurizer)
config["slicer"] = repr(slicer)

config["num_samples"] = None


def featurize(dataset) -> FeatureDataset:
    return FeatureDataset(
        tqdm(dataset, desc="Featurizing", leave=False),
        slicer,
        featurizer,
        num_samples=config["num_samples"],
        label_field="label",
        label_zero="human",
    )

In [46]:
config["seed"] = 42

sizes = [0.1] * 10
ai_splits = {}
for agent in tqdm(["human", "gpt"], desc="Domains", position=0):
# for agent in tqdm(agents, desc="Domains", position=0):
    seed_everything(config["seed"])
    ai_splits[agent] = [
        featurize(subset)
        for subset in n_way_split(
            MongoFindDataset(
                {"document.agent": agent, "model.name": config["feature_model"]},
                projection={
                    "_id": 1,
                    "features": [
                        {
                            "label": "$document.label",
                            "agent": "$document.agent",
                            "type": "$document.type",
                            "split": "$split",
                            "transition_scores": "$transition_scores",
                        }
                    ],
                },
                mongo_db_connection=os.environ.get("MONGO_DB_CONNECTION"),
                collection="features_Ghostbuster",
            ).load(),
            *sizes,
        )
    ]

Domains:   0%|          | 0/2 [00:00<?, ?it/s]

Seed set to 42


[MongoFindDataset] Loading Data from Cache File /nvme/.cache/luminar/2fc8f94775268bb36929501eeb2b22785746a2d838056230e87e68f6c8395ca6.pkl


Featurizing:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing:   0%|          | 0/199 [00:00<?, ?it/s]

Featurizing:   0%|          | 0/199 [00:00<?, ?it/s]

Featurizing:   0%|          | 0/199 [00:00<?, ?it/s]

Featurizing:   0%|          | 0/199 [00:00<?, ?it/s]

Featurizing:   0%|          | 0/199 [00:00<?, ?it/s]

Featurizing:   0%|          | 0/199 [00:00<?, ?it/s]

Seed set to 42


[MongoFindDataset] Loading Data from Cache File /nvme/.cache/luminar/ba86eacbce10c7dc12309de1b948eb81c824d0d941c989b59968f834c2199be8.pkl


Featurizing:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing:   0%|          | 0/200 [00:00<?, ?it/s]

In [47]:
human_subsets = ai_splits.pop("human")

## In-Domain Training & Evaluation

In [48]:
from torch.utils.data import ConcatDataset

config |= {
    "batch_size": 32,
}


# capturing config from "closure"
def get_dataloader(*dataset, **kwargs) -> PaddingDataloader:
    if len(dataset) == 1:
        dataset = dataset[0]
    else:
        dataset = ConcatDataset(dataset)
    return PaddingDataloader(
        dataset,
        feature_dim=config["feature_dim"],
        batch_size=config["batch_size"],
        **kwargs,
    )

In [49]:
config |= {
    "projection_dim": 32,
    "learning_rate": 0.0001,
    "warmup_steps": 80,
    "max_epochs": 50,
    "gradient_clip_val": 1.0,
}

# SeqXGPT Layer Configuration
config["conv_layer_shapes"] = [
    ConvolutionalLayerSpec(64, 5),
    *[ConvolutionalLayerSpec(128, 3)],
    ConvolutionalLayerSpec(64, 3),
]

In [50]:
ai_subsets = ai_splits[agent]
seed_everything(config["seed"])

# human_subsets.insert(0, human_subsets.pop())
# ai_subsets.insert(0, ai_subsets.pop())

eval_dataloader = get_dataloader(human_subsets[0], ai_subsets[0])
test_dataloader = get_dataloader(*human_subsets[1:3], *ai_subsets[1:3])
train_dataloader = get_dataloader(*human_subsets[3:], *ai_subsets[3:], shuffle=True)

model = DocumentClassficationModel(**config)
trainer = Trainer(
    max_epochs=config["max_epochs"],
    logger=pl_loggers.TensorBoardLogger(
        save_dir=f"logs/Ghostbuster/in_domain/{type(featurizer).__name__}",
        name=agent,
    ),
    gradient_clip_val=config["gradient_clip_val"],
    callbacks=[EarlyStopping(monitor="val_loss", mode="min", patience=10)],
    deterministic=True,
)
trainer.fit(
    model,
    train_dataloaders=train_dataloader,
    val_dataloaders=eval_dataloader,
)
(metrics,) = trainer.test(model, test_dataloader, verbose=False)
pd.DataFrame([metrics])

Seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes
------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 53.6 K | train | ?           | ?        
1 | projection  | Sequential        | 524 K  | train | [32, 16384] | [32, 32] 
2 | classifier  | Linear            | 33     | train | [32, 32]    | [32, 1]  
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?        
------------------------------------------------------------------------------------
577 K     Trainable params
0         Non-trainable params
577 K     Total params
2.312     Total estimated model params size (MB)
13        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

Unnamed: 0,test_loss,test_acc@0.5,test_f1@0.5,test_acc@best,test_f1@best,test_roc_auc
0,0.346679,0.855,0.851282,0.83,0.844749,0.931813


In [None]:

results_in_domain = defaultdict(list)
for agent, ai_subsets in tqdm(ai_splits.items()):
    for _ in trange(5, desc=agent, position=1):
        seed_everything(config["seed"])

        human_subsets.insert(0, human_subsets.pop())
        ai_subsets.insert(0, ai_subsets.pop())
        eval_dataloader = get_dataloader(human_subsets[0], ai_subsets[0])
        test_dataloader = get_dataloader(*human_subsets[1:3], *ai_subsets[1:3])
        train_dataloader = get_dataloader(
            *human_subsets[3:], *ai_subsets[3:], shuffle=True
        )

        model = DocumentClassficationModel(**config)
        trainer = Trainer(
            max_epochs=config["max_epochs"],
            logger=pl_loggers.TensorBoardLogger(
                save_dir=f"logs/Ghostbuster/in_domain/{type(featurizer).__name__}",
                name=agent,
            ),
            gradient_clip_val=config["gradient_clip_val"],
            callbacks=[EarlyStopping(monitor="val_loss", mode="min", patience=3)],
            deterministic=True,
        )
        trainer.progress_bar_callback.disable()

        trainer.fit(
            model,
            train_dataloaders=train_dataloader,
            val_dataloaders=eval_dataloader,
        )
        (metrics,) = trainer.test(model, test_dataloader, verbose=False)
        results_in_domain[agent].append(metrics)

        # metrics = []
        # for other, subsets in splits.items():
        #     trainer.validate(model, get_dataloader(subsets[-1]), verbose=False)
        #     metrics.append(
        #         {"other": other}
        #         | trainer.test(model, get_dataloader(*subsets[:2]), verbose=False)[0]
        #     )
        # results_in_domain[agent].append(metrics)

        print(agent, metrics)


  0%|          | 0/6 [00:00<?, ?it/s]

claude:   0%|          | 0/5 [00:00<?, ?it/s]

Seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules 

claude {'test_loss': 0.1087355762720108, 'test_acc@0.5': 0.9662500023841858, 'test_f1@0.5': 0.9663760662078857, 'test_acc@best': 0.9725000262260437, 'test_f1@best': 0.9723618030548096, 'test_roc_auc': 0.994446873664856}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

claude {'test_loss': 0.14335134625434875, 'test_acc@0.5': 0.9511889815330505, 'test_f1@0.5': 0.9520295262336731, 'test_acc@best': 0.9624530673027039, 'test_f1@best': 0.9624060392379761, 'test_roc_auc': 0.9910212755203247}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

claude {'test_loss': 0.16900520026683807, 'test_acc@0.5': 0.9598997235298157, 'test_f1@0.5': 0.9595959782600403, 'test_acc@best': 0.9523809552192688, 'test_f1@best': 0.9528536200523376, 'test_roc_auc': 0.984698474407196}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

claude {'test_loss': 0.5317082405090332, 'test_acc@0.5': 0.9160401225090027, 'test_f1@0.5': 0.9201430082321167, 'test_acc@best': 0.9373433589935303, 'test_f1@best': 0.9382715821266174, 'test_roc_auc': 0.9760050177574158}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

claude {'test_loss': 0.11776493489742279, 'test_acc@0.5': 0.9573934674263, 'test_f1@0.5': 0.9561855792999268, 'test_acc@best': 0.9624060392379761, 'test_f1@best': 0.9622166156768799, 'test_roc_auc': 0.990207314491272}


gpt:   0%|          | 0/5 [00:00<?, ?it/s]

Seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules 

gpt {'test_loss': 0.4585123658180237, 'test_acc@0.5': 0.7882205247879028, 'test_f1@0.5': 0.8219178318977356, 'test_acc@best': 0.878446102142334, 'test_f1@best': 0.8779874444007874, 'test_roc_auc': 0.9533103108406067}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt {'test_loss': 0.4053586423397064, 'test_acc@0.5': 0.8433583974838257, 'test_f1@0.5': 0.8294679522514343, 'test_acc@best': 0.8395990133285522, 'test_f1@best': 0.8476190567016602, 'test_roc_auc': 0.9352763891220093}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt {'test_loss': 0.440337598323822, 'test_acc@0.5': 0.822277843952179, 'test_f1@0.5': 0.8422222137451172, 'test_acc@best': 0.8598247766494751, 'test_f1@best': 0.8585858345031738, 'test_roc_auc': 0.9325438737869263}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt {'test_loss': 0.4874598979949951, 'test_acc@0.5': 0.8100000023841858, 'test_f1@0.5': 0.7764706015586853, 'test_acc@best': 0.8399999737739563, 'test_f1@best': 0.8419753313064575, 'test_roc_auc': 0.9255437254905701}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt {'test_loss': 0.4792071282863617, 'test_acc@0.5': 0.8050000071525574, 'test_f1@0.5': 0.7671641707420349, 'test_acc@best': 0.8662499785423279, 'test_f1@best': 0.8654087781906128, 'test_roc_auc': 0.9381062388420105}


gpt_prompt1:   0%|          | 0/5 [00:00<?, ?it/s]

Seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules 

gpt_prompt1 {'test_loss': 0.35296154022216797, 'test_acc@0.5': 0.8550000190734863, 'test_f1@0.5': 0.8690744638442993, 'test_acc@best': 0.8824999928474426, 'test_f1@best': 0.881313145160675, 'test_roc_auc': 0.9500062465667725}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt_prompt1 {'test_loss': 0.6446144580841064, 'test_acc@0.5': 0.7434293031692505, 'test_f1@0.5': 0.6677471399307251, 'test_acc@best': 0.8735920190811157, 'test_f1@best': 0.8710089325904846, 'test_roc_auc': 0.9291541576385498}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt_prompt1 {'test_loss': 0.4005458652973175, 'test_acc@0.5': 0.8521303534507751, 'test_f1@0.5': 0.8608490824699402, 'test_acc@best': 0.8634085059165955, 'test_f1@best': 0.8614993691444397, 'test_roc_auc': 0.9269598126411438}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt_prompt1 {'test_loss': 0.2967509925365448, 'test_acc@0.5': 0.8809523582458496, 'test_f1@0.5': 0.8786717653274536, 'test_acc@best': 0.8796992301940918, 'test_f1@best': 0.877237856388092, 'test_roc_auc': 0.9495100378990173}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt_prompt1 {'test_loss': 0.3610204756259918, 'test_acc@0.5': 0.8496240377426147, 'test_f1@0.5': 0.8314606547355652, 'test_acc@best': 0.8897243142127991, 'test_f1@best': 0.8913580179214478, 'test_roc_auc': 0.9577889442443848}


gpt_prompt2:   0%|          | 0/5 [00:00<?, ?it/s]

Seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules 

gpt_prompt2 {'test_loss': 0.36569640040397644, 'test_acc@0.5': 0.8521303534507751, 'test_f1@0.5': 0.8374655842781067, 'test_acc@best': 0.8659147620201111, 'test_f1@best': 0.8690330386161804, 'test_roc_auc': 0.9448366761207581}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt_prompt2 {'test_loss': 0.35692325234413147, 'test_acc@0.5': 0.8421052694320679, 'test_f1@0.5': 0.8538283109664917, 'test_acc@best': 0.8571428656578064, 'test_f1@best': 0.8575000166893005, 'test_roc_auc': 0.9291268587112427}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt_prompt2 {'test_loss': 0.3661993741989136, 'test_acc@0.5': 0.8410513401031494, 'test_f1@0.5': 0.8558456301689148, 'test_acc@best': 0.8723404407501221, 'test_f1@best': 0.8746928572654724, 'test_roc_auc': 0.9375689029693604}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt_prompt2 {'test_loss': 0.4033776819705963, 'test_acc@0.5': 0.856249988079071, 'test_f1@0.5': 0.8476821184158325, 'test_acc@best': 0.8600000143051147, 'test_f1@best': 0.8653846383094788, 'test_roc_auc': 0.9301312565803528}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt_prompt2 {'test_loss': 0.401864230632782, 'test_acc@0.5': 0.8212500214576721, 'test_f1@0.5': 0.8395061492919922, 'test_acc@best': 0.8399999737739563, 'test_f1@best': 0.8501170873641968, 'test_roc_auc': 0.9347624778747559}


gpt_semantic:   0%|          | 0/5 [00:00<?, ?it/s]

Seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules 

gpt_semantic {'test_loss': 0.27327337861061096, 'test_acc@0.5': 0.8924999833106995, 'test_f1@0.5': 0.8938271403312683, 'test_acc@best': 0.893750011920929, 'test_f1@best': 0.8982036113739014, 'test_roc_auc': 0.9562875032424927}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt_semantic {'test_loss': 0.2824324071407318, 'test_acc@0.5': 0.8936170339584351, 'test_f1@0.5': 0.9001175165176392, 'test_acc@best': 0.9011263847351074, 'test_f1@best': 0.9035409092903137, 'test_roc_auc': 0.962274432182312}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt_semantic {'test_loss': 0.2922516465187073, 'test_acc@0.5': 0.88345867395401, 'test_f1@0.5': 0.8768212199211121, 'test_acc@best': 0.8696742057800293, 'test_f1@best': 0.8820861577987671, 'test_roc_auc': 0.9593467116355896}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt_semantic {'test_loss': 0.2812647521495819, 'test_acc@0.5': 0.8796992301940918, 'test_f1@0.5': 0.8775510191917419, 'test_acc@best': 0.8771929740905762, 'test_f1@best': 0.8784118890762329, 'test_roc_auc': 0.9551758766174316}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt_semantic {'test_loss': 0.4555540978908539, 'test_acc@0.5': 0.8283208012580872, 'test_f1@0.5': 0.8023087978363037, 'test_acc@best': 0.8859649300575256, 'test_f1@best': 0.8858218193054199, 'test_roc_auc': 0.9552261233329773}


gpt_writing:   0%|          | 0/5 [00:00<?, ?it/s]

Seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules 

gpt_writing {'test_loss': 0.2586323916912079, 'test_acc@0.5': 0.8847118020057678, 'test_f1@0.5': 0.8832487463951111, 'test_acc@best': 0.8872180581092834, 'test_f1@best': 0.8863636255264282, 'test_roc_auc': 0.9614133238792419}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt_writing {'test_loss': 0.3856202960014343, 'test_acc@0.5': 0.8270676732063293, 'test_f1@0.5': 0.801152765750885, 'test_acc@best': 0.878446102142334, 'test_f1@best': 0.8857479095458984, 'test_roc_auc': 0.952305257320404}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt_writing {'test_loss': 0.6687334775924683, 'test_acc@0.5': 0.7309136390686035, 'test_f1@0.5': 0.6386554837226868, 'test_acc@best': 0.8723404407501221, 'test_f1@best': 0.8728179335594177, 'test_roc_auc': 0.9510839581489563}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt_writing {'test_loss': 0.4027191698551178, 'test_acc@0.5': 0.875, 'test_f1@0.5': 0.8714653253555298, 'test_acc@best': 0.8675000071525574, 'test_f1@best': 0.875, 'test_roc_auc': 0.9446125030517578}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

gpt_writing {'test_loss': 0.4400838315486908, 'test_acc@0.5': 0.862500011920929, 'test_f1@0.5': 0.8463687300682068, 'test_acc@best': 0.8887500166893005, 'test_f1@best': 0.8954171538352966, 'test_roc_auc': 0.9561562538146973}


In [11]:
import pandas as pd

df = pd.DataFrame(
    [
        {
            "agent": agent,
            **metric,
        }
        for agent, metrics in results_in_domain.items()
        for metric in metrics
    ]
)
df = df.groupby("agent").mean()
print(df.to_latex(float_format="\\np{%.3f}"))
df

\begin{tabular}{lrrrrrr}
\toprule
 & test_loss & test_acc@0.5 & test_f1@0.5 & test_acc@best & test_f1@best & test_roc_auc \\
agent &  &  &  &  &  &  \\
\midrule
claude & \np{0.214} & \np{0.950} & \np{0.951} & \np{0.957} & \np{0.958} & \np{0.987} \\
gpt & \np{0.454} & \np{0.814} & \np{0.807} & \np{0.857} & \np{0.858} & \np{0.937} \\
gpt_prompt1 & \np{0.411} & \np{0.836} & \np{0.822} & \np{0.878} & \np{0.876} & \np{0.943} \\
gpt_prompt2 & \np{0.379} & \np{0.843} & \np{0.847} & \np{0.859} & \np{0.863} & \np{0.935} \\
gpt_semantic & \np{0.317} & \np{0.876} & \np{0.870} & \np{0.886} & \np{0.890} & \np{0.958} \\
gpt_writing & \np{0.431} & \np{0.836} & \np{0.808} & \np{0.879} & \np{0.883} & \np{0.953} \\
\bottomrule
\end{tabular}



Unnamed: 0_level_0,test_loss,test_acc@0.5,test_f1@0.5,test_acc@best,test_f1@best,test_roc_auc
agent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
claude,0.214113,0.950154,0.950866,0.957417,0.957622,0.987276
gpt,0.454175,0.813771,0.807449,0.856824,0.858315,0.936956
gpt_prompt1,0.411179,0.836227,0.821561,0.877785,0.876483,0.942684
gpt_prompt2,0.378812,0.842557,0.846866,0.85908,0.863346,0.935285
gpt_semantic,0.316955,0.875519,0.870125,0.885542,0.889613,0.957662
gpt_writing,0.431158,0.836039,0.808178,0.878851,0.883069,0.953114


## Cross-Domain Training & Evaluation

In [None]:
results_cross_domain = defaultdict(list)
for _ in trange(5, position=0):
    seed_everything(config["seed"])

    human_subsets.insert(0, human_subsets.pop())
    for ai_subsets in ai_splits.values():
        ai_subsets.insert(0, ai_subsets.pop())

    eval_dataloader = get_dataloader(
        human_subsets[0],
        *[ai_subsets[0] for ai_subsets in ai_splits.values()],
    )
    train_dataloader = get_dataloader(
        *human_subsets[3:],
        *[subset for ai_subsets in ai_splits.values() for subset in ai_subsets[3:]],
        shuffle=True,
    )

    model = DocumentClassficationModel(**config)
    trainer = Trainer(
        max_epochs=config["max_epochs"],
        logger=pl_loggers.TensorBoardLogger(
            save_dir=f"logs/Ghostbuster/cross_domain/{type(featurizer).__name__}",
            name=agent,
        ),
        gradient_clip_val=config["gradient_clip_val"],
        callbacks=[EarlyStopping(monitor="val_loss", mode="min", patience=3)],
        deterministic=True,
    )
    trainer.progress_bar_callback.disable()

    trainer.fit(
        model,
        train_dataloaders=train_dataloader,
        val_dataloaders=eval_dataloader,
    )

    for agent, ai_subsets in tqdm(ai_splits.items(), position=1):
        (metrics,) = trainer.test(
            model, get_dataloader(*human_subsets[1:3], *ai_subsets[1:3]), verbose=False
        )
        results_cross_domain[agent].append(metrics)
        print(agent, metrics)


  0%|          | 0/5 [00:00<?, ?it/s]

Seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules 

  0%|          | 0/6 [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


claude {'test_loss': 0.5283629298210144, 'test_acc@0.5': 0.8212500214576721, 'test_f1@0.5': 0.84573894739151, 'test_acc@best': 0.8787500262260437, 'test_f1@best': 0.887601375579834, 'test_roc_auc': 0.9706687331199646}
gpt {'test_loss': 0.5276883840560913, 'test_acc@0.5': 0.8237500190734863, 'test_f1@0.5': 0.8482239246368408, 'test_acc@best': 0.8824999928474426, 'test_f1@best': 0.8914549946784973, 'test_roc_auc': 0.9641781449317932}
gpt_prompt1 {'test_loss': 0.5279452204704285, 'test_acc@0.5': 0.8237500190734863, 'test_f1@0.5': 0.8482239246368408, 'test_acc@best': 0.8762500286102295, 'test_f1@best': 0.8850173950195312, 'test_roc_auc': 0.9586906433105469}
gpt_prompt2 {'test_loss': 0.5247037410736084, 'test_acc@0.5': 0.8274999856948853, 'test_f1@0.5': 0.8519313335418701, 'test_acc@best': 0.8799999952316284, 'test_f1@best': 0.8888888955116272, 'test_roc_auc': 0.9624156355857849}
gpt_semantic {'test_loss': 0.5324054956436157, 'test_acc@0.5': 0.8199999928474426, 'test_f1@0.5': 0.844492435455

Seed set to 42


gpt_writing {'test_loss': 0.5153222680091858, 'test_acc@0.5': 0.8287500143051147, 'test_f1@0.5': 0.8531618714332581, 'test_acc@best': 0.887499988079071, 'test_f1@best': 0.8965517282485962, 'test_roc_auc': 0.9693499803543091}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

  0%|          | 0/6 [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


claude {'test_loss': 0.6655161380767822, 'test_acc@0.5': 0.75844806432724, 'test_f1@0.5': 0.7979057431221008, 'test_acc@best': 0.7759699821472168, 'test_f1@best': 0.8097768425941467, 'test_roc_auc': 0.9124373197555542}
gpt {'test_loss': 0.6214667558670044, 'test_acc@0.5': 0.7747184038162231, 'test_f1@0.5': 0.8140496015548706, 'test_acc@best': 0.7922403216362, 'test_f1@best': 0.8259958028793335, 'test_roc_auc': 0.9567449688911438}
gpt_prompt1 {'test_loss': 0.6167447566986084, 'test_acc@0.5': 0.7784730792045593, 'test_f1@0.5': 0.8177136778831482, 'test_acc@best': 0.7947434186935425, 'test_f1@best': 0.8284518718719482, 'test_roc_auc': 0.9528696537017822}
gpt_prompt2 {'test_loss': 0.612399697303772, 'test_acc@0.5': 0.7772215008735657, 'test_f1@0.5': 0.8164948225021362, 'test_acc@best': 0.7947434186935425, 'test_f1@best': 0.8284518718719482, 'test_roc_auc': 0.9618953466415405}
gpt_semantic {'test_loss': 0.6137882471084595, 'test_acc@0.5': 0.7772215008735657, 'test_f1@0.5': 0.816494822502136

Seed set to 42


gpt_writing {'test_loss': 0.6060510277748108, 'test_acc@0.5': 0.7822278141975403, 'test_f1@0.5': 0.8213552236557007, 'test_acc@best': 0.7997496724128723, 'test_f1@best': 0.8333333134651184, 'test_roc_auc': 0.9697869420051575}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

  0%|          | 0/6 [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


claude {'test_loss': 0.4741660952568054, 'test_acc@0.5': 0.8245614171028137, 'test_f1@0.5': 0.8444444537162781, 'test_acc@best': 0.8157894611358643, 'test_f1@best': 0.8393442630767822, 'test_roc_auc': 0.9424999952316284}
gpt {'test_loss': 0.4461359679698944, 'test_acc@0.5': 0.829573929309845, 'test_f1@0.5': 0.8495575189590454, 'test_acc@best': 0.8245614171028137, 'test_f1@best': 0.8481561541557312, 'test_roc_auc': 0.9536117911338806}
gpt_prompt1 {'test_loss': 0.4463060796260834, 'test_acc@0.5': 0.8320801854133606, 'test_f1@0.5': 0.8520971536636353, 'test_acc@best': 0.8270676732063293, 'test_f1@best': 0.850649356842041, 'test_roc_auc': 0.948951005935669}
gpt_prompt2 {'test_loss': 0.4251119792461395, 'test_acc@0.5': 0.8421052694320679, 'test_f1@0.5': 0.862144410610199, 'test_acc@best': 0.8308270573616028, 'test_f1@best': 0.8543689250946045, 'test_roc_auc': 0.9601067900657654}
gpt_semantic {'test_loss': 0.4242076873779297, 'test_acc@0.5': 0.8383458852767944, 'test_f1@0.5': 0.8583973646163

Seed set to 42


gpt_writing {'test_loss': 0.4172142744064331, 'test_acc@0.5': 0.8446115255355835, 'test_f1@0.5': 0.864628791809082, 'test_acc@best': 0.8333333134651184, 'test_f1@best': 0.8568353056907654, 'test_roc_auc': 0.9709485173225403}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

  0%|          | 0/6 [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


claude {'test_loss': 0.2881002128124237, 'test_acc@0.5': 0.897243082523346, 'test_f1@0.5': 0.9002432823181152, 'test_acc@best': 0.8546366095542908, 'test_f1@best': 0.8696629405021667, 'test_roc_auc': 0.966158926486969}
gpt {'test_loss': 0.33411896228790283, 'test_acc@0.5': 0.8872180581092834, 'test_f1@0.5': 0.8894348740577698, 'test_acc@best': 0.8458646535873413, 'test_f1@best': 0.8607021570205688, 'test_roc_auc': 0.9500596523284912}
gpt_prompt1 {'test_loss': 0.32562491297721863, 'test_acc@0.5': 0.8809523582458496, 'test_f1@0.5': 0.8825711011886597, 'test_acc@best': 0.8508771657943726, 'test_f1@best': 0.8658398985862732, 'test_roc_auc': 0.9526947140693665}
gpt_prompt2 {'test_loss': 0.28810012340545654, 'test_acc@0.5': 0.8984962701797485, 'test_f1@0.5': 0.9015795588493347, 'test_acc@best': 0.8621553778648376, 'test_f1@best': 0.8772321343421936, 'test_roc_auc': 0.961127519607544}
gpt_semantic {'test_loss': 0.2946299612522125, 'test_acc@0.5': 0.902255654335022, 'test_f1@0.5': 0.9055690169

Seed set to 42


gpt_writing {'test_loss': 0.2732551395893097, 'test_acc@0.5': 0.9072681665420532, 'test_f1@0.5': 0.9108433723449707, 'test_acc@best': 0.8621553778648376, 'test_f1@best': 0.8772321343421936, 'test_roc_auc': 0.9654051661491394}


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes  
--------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?          
1 | projection  | Identity          | 0      | train | [32, 16384] | [32, 16384]
2 | classifier  | Linear            | 16.4 K | train | [32, 16384] | [32, 1]    
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?          
--------------------------------------------------------------------------------------
168 K     Trainable params
0         Non-trainable params
168 K     Total params
0.674     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
  

  0%|          | 0/6 [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


claude {'test_loss': 0.5803608298301697, 'test_acc@0.5': 0.7819548845291138, 'test_f1@0.5': 0.8187500238418579, 'test_acc@best': 0.8446115255355835, 'test_f1@best': 0.8628318309783936, 'test_roc_auc': 0.9629836678504944}
gpt {'test_loss': 0.5899897813796997, 'test_acc@0.5': 0.780701756477356, 'test_f1@0.5': 0.8175182342529297, 'test_acc@best': 0.8383458852767944, 'test_f1@best': 0.8565072417259216, 'test_roc_auc': 0.9560803771018982}
gpt_prompt1 {'test_loss': 0.5841663479804993, 'test_acc@0.5': 0.7832080125808716, 'test_f1@0.5': 0.819979190826416, 'test_acc@best': 0.8458646535873413, 'test_f1@best': 0.8640884160995483, 'test_roc_auc': 0.958574116230011}
gpt_prompt2 {'test_loss': 0.581856369972229, 'test_acc@0.5': 0.780701756477356, 'test_f1@0.5': 0.8175182342529297, 'test_acc@best': 0.8433583974838257, 'test_f1@best': 0.8615725636482239, 'test_roc_auc': 0.9590766429901123}
gpt_semantic {'test_loss': 0.5920119881629944, 'test_acc@0.5': 0.7819548845291138, 'test_f1@0.5': 0.81875002384185

In [15]:
df = pd.DataFrame(
    [
        {
            "agent": agent,
            **metric,
        }
        for agent, metrics in results_cross_domain.items()
        for metric in metrics
    ]
)
df = df.groupby("agent").mean()
print(df.to_latex(float_format="\\np{%.3f}"))
df

\begin{tabular}{lrrrrrr}
\toprule
 & test_loss & test_acc@0.5 & test_f1@0.5 & test_acc@best & test_f1@best & test_roc_auc \\
agent &  &  &  &  &  &  \\
\midrule
claude & \np{0.507} & \np{0.817} & \np{0.841} & \np{0.834} & \np{0.854} & \np{0.951} \\
gpt & \np{0.504} & \np{0.819} & \np{0.844} & \np{0.837} & \np{0.857} & \np{0.956} \\
gpt_prompt1 & \np{0.500} & \np{0.820} & \np{0.844} & \np{0.839} & \np{0.859} & \np{0.954} \\
gpt_prompt2 & \np{0.486} & \np{0.825} & \np{0.850} & \np{0.842} & \np{0.862} & \np{0.961} \\
gpt_semantic & \np{0.491} & \np{0.824} & \np{0.849} & \np{0.839} & \np{0.859} & \np{0.960} \\
gpt_writing & \np{0.478} & \np{0.829} & \np{0.854} & \np{0.846} & \np{0.866} & \np{0.968} \\
\bottomrule
\end{tabular}



Unnamed: 0_level_0,test_loss,test_acc@0.5,test_f1@0.5,test_acc@best,test_f1@best,test_roc_auc
agent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
claude,0.507301,0.816691,0.841416,0.833952,0.853843,0.95095
gpt,0.50388,0.819192,0.843757,0.836702,0.856563,0.956135
gpt_prompt1,0.500157,0.819693,0.844117,0.838961,0.858809,0.954356
gpt_prompt2,0.486434,0.825205,0.849934,0.842217,0.862103,0.960924
gpt_semantic,0.491409,0.823956,0.848741,0.838713,0.858542,0.95976
gpt_writing,0.477959,0.829464,0.854239,0.845721,0.865608,0.968103


In [13]:
import pandas as pd


def df_to_latex_heatmap(_df: pd.DataFrame):
    print(
        "\\plotHeatmap{"
        + ",".join(_df.index)
        + "}{%\n    "
        + ",%\n    ".join(
            [
                "{"
                + ",".join(f"{val:.4f}/{round(val, 2):.2f}" for val in row[1:])
                + "}"
                for row in _df.reset_index().values
            ]
        )
        + "%\n}{"
        + ",".join(_df.columns)
        + "}"
    )


In [14]:
_metric = "test_roc_auc"
# _metric = "test_acc@best"

results = []
for ai_counterpart in sources:
    results.append(
        [
            results_in_domain[ai_counterpart]["metrics"][i][_metric]
            for i in range(len(sources))
        ]
    )
results.append([m[_metric] for m in metrics_cross_domain[: len(sources)]])

df = pd.DataFrame(results, columns=sources, index=list(sources) + ["ALL"])
df["AVG"] = df.mean(axis=1)
df["AVG"][-1] = metrics_cross_domain[-1][_metric]
df

NameError: name 'sources' is not defined

In [None]:
df_to_latex_heatmap(df)

In [None]:
config

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

ax = sns.heatmap(
    df,
    annot=True,
    fmt=".2f",
    vmax=1.0,
    vmin=0.0,
    cmap=sns.cubehelix_palette(rot=-0.2, as_cmap=True),
    yticklabels=list(sources) + ["ALL"],
    xticklabels=list(sources) + ["AVG"],
    square=True,
    # reduce annotation font size
    annot_kws={"fontsize": 8},
    cbar=False,
)

# rotate x-axis labels by 45 degrees
# anchored at the right edge of the axes
for tick in ax.get_xticklabels():
    tick.set_rotation(45)
    tick.set_horizontalalignment("right")

plt.tight_layout()
# plt.savefig(
#     "../figures/evaluation-trained_in_domain-test_0.1-gpt2_256-rand_4-il_13_as_channels.pdf",
#     dpi=300,
# )
plt.show()

### LLR on Whole Datasets

In [None]:
from sklearn.metrics import auc, roc_curve

from luminar.baselines import llr_from_transition_scores
from simple_dataset import Dataset as SimpleDataset
from transition_scores.data import TransitionScores

results_llr = []
for ai_counterpart, split in datasets.items():
    dataset_test = (
        SimpleDataset(split)
        .flat_map(lambda doc: doc["features"])
        .map(
            lambda x: {
                "llr": llr_from_transition_scores(
                    TransitionScores(**x["transition_scores"])
                ),
                "labels": int(x["label"] != "human"),
            },
            in_place=False,
        )
    )
    fpr, tpr, _ = roc_curve(dataset_test["labels"], dataset_test["llr"])
    auroc = auc(fpr, tpr)

    preds = np.array(dataset_test["llr"])
    labels = np.array(dataset_test["labels"])

    mean_0 = float(np.mean(preds[labels == 0]))
    mean_1 = float(np.mean(preds[labels == 1]))

    thresholds = np.linspace(round(mean_0, 1) - 0.2, round(mean_1, 1) + 0.3, 5001)
    preds_thresholded: np.ndarray = preds > thresholds.reshape(-1, 1)
    acc_thresholded = np.mean((preds_thresholded == labels), axis=1)
    idx = np.argmax(acc_thresholded)
    best_threshold = thresholds[idx]
    best_acc = acc_thresholded[idx]

    results_llr.append(
        {
            "domain": ai_counterpart,
            "auroc": auroc,
            "best_acc": best_acc,
            "best_threshold": best_threshold,
        }
    )

pd.DataFrame(results_llr)

### LLR on Test Splits

In [None]:
from sklearn.metrics import auc, roc_curve

from luminar.baselines import llr_from_transition_scores
from simple_dataset import Dataset as SimpleDataset
from transition_scores.data import TransitionScores

results_llr = []
for ai_counterpart, split in test_splits.items():
    dataset_test = (
        SimpleDataset(split)
        .flat_map(lambda doc: doc["features"])
        .map(
            lambda x: {
                "llr": llr_from_transition_scores(
                    TransitionScores(**x["transition_scores"])
                ),
                "labels": int(x["label"] != "human"),
            },
            in_place=False,
        )
    )
    fpr, tpr, _ = roc_curve(dataset_test["labels"], dataset_test["llr"])
    auroc = auc(fpr, tpr)

    preds = np.array(dataset_test["llr"])
    labels = np.array(dataset_test["labels"])

    mean_0 = float(np.mean(preds[labels == 0]))
    mean_1 = float(np.mean(preds[labels == 1]))

    thresholds = np.linspace(round(mean_0, 1) - 0.2, round(mean_1, 1) + 0.3, 1001)
    preds_thresholded: np.ndarray = preds > thresholds.reshape(-1, 1)
    acc_thresholded = np.mean((preds_thresholded == labels), axis=1)
    idx = np.argmax(acc_thresholded)
    best_threshold = thresholds[idx]
    best_acc = acc_thresholded[idx]

    results_llr.append(
        {
            "domain": ai_counterpart,
            "auroc": auroc,
            "best_acc": best_acc,
            "best_threshold": best_threshold,
        }
    )

pd.DataFrame(results_llr)

In [None]:
raise RuntimeError("STOP")

In [None]:
sample = train_dataset[0]["features"].numpy().squeeze()
train_dataset[0]["labels"]

In [None]:
fig = plt.imshow(sample.T, cmap=sns.cubehelix_palette(rot=-0.2, as_cmap=True))
fig.axes.set_axis_off()
plt.show()

In [None]:
sample = train_dataset[2]["features"].numpy().squeeze()
train_dataset[2]["labels"]

In [None]:
fig = plt.imshow(sample.T, cmap=sns.cubehelix_palette(rot=-0.2, as_cmap=True))
fig.axes.set_axis_off()
plt.show()

In [None]:
raise RuntimeError("STOP")

In [None]:
from sklearn.metrics import auc, roc_curve

from luminar.baselines import llr_from_transition_scores

dataset_test = dm._dataset_test.map(
    lambda x: {
        "llr": llr_from_transition_scores(x["features"]),
        "labels": x["labels"],
    },
    in_place=False,
)
fpr, tpr, _ = roc_curve(dataset_test["labels"], dataset_test["llr"])
auc(fpr, tpr)