In [6]:
import os
import warnings

import pandas as pd
from dotenv import load_dotenv
from lightning.pytorch import Trainer, seed_everything
from lightning.pytorch import loggers as pl_loggers
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from tqdm.auto import tqdm, trange

load_dotenv("../.env")

warnings.filterwarnings("ignore", ".*does not have many workers.*")

In [7]:
from pathlib import Path
import openai


client = openai.OpenAI(api_key=(Path.home() / (".openai-key")).read_text().strip())

In [None]:
import numpy as np
import dill as pickle
import tiktoken

from ghostbuster.utils.featurize import t_featurize_logprobs, score_ngram
from ghostbuster.utils.symbolic import (
    train_trigram,
    get_words,
    vec_functions,
    scalar_functions,
)


MAX_TOKENS = 2047
best_features = open("ghostbuster/model/features.txt").read().strip().split("\n")

# Load davinci tokenizer
enc = tiktoken.encoding_for_model("davinci-002")

# Load model
model = pickle.load(open("ghostbuster/model/model", "rb"))
mu = pickle.load(open("ghostbuster/model/mu", "rb"))
sigma = pickle.load(open("ghostbuster/model/sigma", "rb"))

# Train trigram
print("Loading Trigram Model...")

trigram_model = train_trigram()


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loading Trigram Model...
Tokenizing corpus...


100%|██████████| 57340/57340 [00:02<00:00, 28387.86it/s]



Training n-gram model...


100%|██████████| 1283106/1283106 [00:00<00:00, 1672909.03it/s]
100%|██████████| 1283105/1283105 [00:01<00:00, 1100940.39it/s]
100%|██████████| 1283104/1283104 [00:02<00:00, 517280.78it/s]


In [None]:
def ghostbuster_pred(doc: str):
    trigram = np.array(
        score_ngram(doc, trigram_model, enc.encode, n=3, strip_first=False)
    )
    unigram = np.array(
        score_ngram(doc, trigram_model.base, enc.encode, n=1, strip_first=False)
    )

    response = client.completions.create(
        # model="ada",  # DEPRECATED, replaced by babbage-002
        model="babbage-002",
        prompt="<|endoftext|>" + doc,
        max_tokens=0,
        echo=True,
        logprobs=1,
    ).to_dict()
    ada = np.array(
        list(
            map(
                lambda x: np.exp(x),
                response["choices"][0]["logprobs"]["token_logprobs"][1:],
            )
        )
    )

    response = client.completions.create(
        # model="davinci",  # DEPRECATED, replaced by davinci-002
        model="davinci-002",
        prompt="<|endoftext|>" + doc,
        max_tokens=0,
        echo=True,
        logprobs=1,
    ).to_dict()
    davinci = np.array(
        list(
            map(
                lambda x: np.exp(x),
                response["choices"][0]["logprobs"]["token_logprobs"][1:],
            )
        )
    )

    subwords = response["choices"][0]["logprobs"]["tokens"][1:]
    gpt2_map = {"\n": "Ċ", "\t": "ĉ", " ": "Ġ"}
    for i in range(len(subwords)):
        for k, v in gpt2_map.items():
            subwords[i] = subwords[i].replace(k, v)

    t_features = t_featurize_logprobs(davinci, ada, subwords)

    vector_map = {
        "davinci-logprobs": davinci,
        "ada-logprobs": ada,
        "trigram-logprobs": trigram,
        "unigram-logprobs": unigram,
    }

    exp_features = []
    for exp in best_features:
        exp_tokens = get_words(exp)
        curr = vector_map[exp_tokens[0]]

        for i in range(1, len(exp_tokens)):
            if exp_tokens[i] in vec_functions:
                next_vec = vector_map[exp_tokens[i + 1]]
                curr = vec_functions[exp_tokens[i]](curr, next_vec)
            elif exp_tokens[i] in scalar_functions:
                exp_features.append(scalar_functions[exp_tokens[i]](curr))
                break

    data = (np.array(t_features + exp_features) - mu) / sigma
    preds = model.predict_proba(data.reshape(-1, 1).T)[:, 1]

    return preds

In [11]:
ghostbuster_pred("Hello, world! This is a test. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.")

array([1.])

### Luminar

In [None]:


from luminar.data import (
    PaddingDataloader,
    n_way_split,
)
from luminar.model import CNNDocumentClassficationModel
from luminar.mongo import MongoPipelineDataset

In [95]:
domains = {
    "Blog Authorship": {"domain": "blog_authorship_corpus"},
    "Student Essays": {"domain": "student_essays"},
    "CNN News": {"domain": "cnn_news"},
    "Euro Court Cases": {"domain": "euro_court_cases"},
    "House of Commons": {"domain": "house_of_commons"},
    "ArXiv Papers": {"domain": "arxiv_papers"},
    # "Gutenberg [EN]": {"domain": "gutenberg", "lang": "en-EN", "name": "gutenberg_en"},
    # "Gutenberg [DE]": {"domain": "gutenberg", "lang": "de-DE", "name": "gutenberg_de"},
    "Bundestag": {"domain": "bundestag"},
    "Spiegel": {"domain": "spiegel_articles"},
}

In [None]:
config = {
    "eval_split": 0.1,
    "test_split": 0.2,
    "synth_agent": "gpt-4o-mini",
    "document_type": "fulltext",
}

## Features

In [96]:
config["seed"] = 42

sizes = [0.1] * 10
subsets = {}
for domain, kwargs in tqdm(domains.items(), desc="Domains", position=0):
    domain_name = kwargs.pop("name", kwargs["domain"])
    seed_everything(config["seed"], verbose=False)
    subsets[domain] = n_way_split(
        MongoPipelineDataset(
            mongo_db_connection=os.getenv("MONGO_DB_CONNECTION"),
            pipeline=[
                {
                    "$match": {
                        "type": config["document_type"],
                        # "type": "fulltext",
                        "agent": {"$in": ["human", config["synth_agent"]]},
                        # "agent": {"$in": ["human", "gpt-4o-mini"]},
                        **kwargs,
                    }
                },
                {
                    "$project": {
                        "text": 1,
                        "agent": 1,
                        "label_str": "$label",
                        "label": {
                            "$cond": {
                                "if": {"$eq": ["$agent", "human"]},
                                "then": 0,
                                "else": 1,
                            }
                        },
                        "id": {
                            "$cond": {
                                "if": {"$eq": ["$agent", "human"]},
                                "then": "$id",
                                "else": "$source",
                            }
                        },
                    }
                },
                {
                    "$group": {
                        "_id": "$id",
                        "samples": {
                            "$push": {
                                "text": "$text",
                                "agent": "$agent",
                                "label_str": "$label_str",
                                "label": "$label",
                            }
                        },
                    }
                },
                {
                    "$match": {
                        "samples.1": {"$exists": True},
                    }
                },
            ],
            database="prismai",
            collection="dataset_PrismAI",
            # update_cache=True,
        ).load(verbose=False)[:1500],
        *sizes,
    )

Domains:   0%|          | 0/8 [00:00<?, ?it/s]

In [86]:
subsets[domain][0][0]

{'_id': '011b4eb0-2b82-441e-a0ff-1792ef9f036a',
 'samples': [{'text': 'Bundesverkehrsminister Andreas Scheuer (CSU) tut dieser Tage so, als sei er stets der größte Freund von Nachrüstung älterer Diesel-Fahrzeugen mit Stickoxidkatalysatoren gewesen. Er habe die Vorschriften für den Umbau erlassen, die Hersteller hätten ihm zugesagt, erste Systeme im ersten Halbjahr 2019 fertigzustellen. "Ich gehe davon aus, dass der Zeitplan von allen eingehalten wird", sagte er kürzlich der "Hannoverschen Allgemeinen Zeitung" - so als könne ihm alles nicht schnell genug gehen. Dabei hatte er die Hardwarenachrüstung lange Zeit bekämpft.\nDie Pionierarbeit erledigten andere, etwa der ADAC. Der Automobilverband begann vor einem Jahr damit, Hardwarenachrüstsets zu testen. Spezialisten fuhren Zehntausende Kilometer mit einem\nVW T5, ausgestattet mit einem Katalysator des Herstellers Oberland-Mangold, \neinem Opel Astra (Abgasreinigung von Twintec) \nund einem Fiat Ducato (Katalysator von HJS). \nDer Fiat er

In [None]:
splits = {}

for domain, subset in tqdm(subsets.items(), desc="Subsets", position=0):
    splits[domain] = [
        [
            {
                # Actually just the truncated text
                "features": enc.decode(
                    enc.encode(sample["text"].replace("<|endoftext|>", ""))[:MAX_TOKENS]
                ).strip(),
                "label": sample["label"],
            }
            for doc in tqdm(split, desc="Samples", position=2, leave=False)
            for sample in doc["samples"]
        ]
        for split in tqdm(subset, desc="Splits", position=1, leave=False)
    ]


In [89]:
splits[domain][0][0]

{'features': 'Bundesverkehrsminister Andreas Scheuer (CSU) tut dieser Tage so, als sei er stets der größte Freund von Nachrüstung älterer Diesel-Fahrzeugen mit Stickoxidkatalysatoren gewesen. Er habe die Vorschriften für den Umbau erlassen, die Hersteller hätten ihm zugesagt, erste Systeme im ersten Halbjahr 2019 fertigzustellen. "Ich gehe davon aus, dass der Zeitplan von allen eingehalten wird", sagte er kürzlich der "Hannoverschen Allgemeinen Zeitung" - so als könne ihm alles nicht schnell genug gehen. Dabei hatte er die Hardwarenachrüstung lange Zeit bekämpft.\nDie Pionierarbeit erledigten andere, etwa der ADAC. Der Automobilverband begann vor einem Jahr damit, Hardwarenachrüstsets zu testen. Spezialisten fuhren Zehntausende Kilometer mit einem\nVW T5, ausgestattet mit einem Katalysator des Herstellers Oberland-Mangold, \neinem Opel Astra (Abgasreinigung von Twintec) \nund einem Fiat Ducato (Katalysator von HJS). \nDer Fiat erlitt allerdings nach rund 30.000 Kilometern einen irrepar

In [98]:
sum(len(samples) for split in tqdm(splits.values()) for samples in split)

  0%|          | 0/8 [00:00<?, ?it/s]

24000

In [99]:
n_tokens = sum(
    sum(
        sum(
            1 + len(enc.encode(sample["features"]))
            for sample in samples  #
        )
        for samples in split
    )
    for split in tqdm(splits.values())
)
n_tokens

  0%|          | 0/8 [00:00<?, ?it/s]

21009717

In [100]:
n_tokens / 1_000_000 * 2.4

50.42332079999999

In [81]:
n_tokens_sub = sum(
    sum(
        sum(
            1 + len(enc.encode(sample["features"])[:MAX_TOKENS])
            for sample in samples[2700:3000]  #
        )
        for samples in split
    )
    for split in tqdm(splits.values())
)
n_tokens_sub

  0%|          | 0/10 [00:00<?, ?it/s]

1482474

In [82]:
n_tokens_sub / 1_000_000 * 2.4

3.5579376000000003

In [53]:
ghostbuster_pred(splits[domain][0][0]["features"])

array([0.95455812])

In [None]:
config |= {
    "projection_dim": 32,
    "learning_rate": 0.0001,
    "warmup_steps": 66,
    "max_epochs": 25,
    "gradient_clip_val": 1.0,
    "batch_size": 32,
}

In [9]:
from torch.utils.data import ConcatDataset


# capturing config from "closure"
def get_dataloader(*dataset, **kwargs) -> PaddingDataloader:
    if len(dataset) == 1:
        dataset = dataset[0]
    else:
        dataset = ConcatDataset(dataset)
    return PaddingDataloader(
        dataset,
        feature_dim=config["feature_dim"],
        batch_size=config["batch_size"],
        **kwargs,
    )

## In-Domain Training & Evaluation

In [10]:
from collections import defaultdict

metrics_in_domain = defaultdict(list)
for domain, subsets in tqdm(splits.items()):
    for _ in trange(5, desc=domain, position=1):
        seed_everything(config["seed"], verbose=False)
        # cycle through splits for cross-validation
        eval_dataset = subsets.pop(0)
        test_dataloader = get_dataloader(*subsets[:2])
        train_dataloader = get_dataloader(*subsets[2:], shuffle=True)
        eval_dataloader = get_dataloader(eval_dataset)
        subsets.append(eval_dataset)

        model = CNNDocumentClassficationModel(**config)
        trainer = Trainer(
            max_epochs=config["max_epochs"],
            logger=pl_loggers.TensorBoardLogger(
                save_dir=f"logs/in_domain/{type(featurizer).__name__}",
                name=domain,
            ),
            gradient_clip_val=config["gradient_clip_val"],
            callbacks=[EarlyStopping(monitor="val_loss", mode="min", patience=3)],
            deterministic=True,
        )
        trainer.progress_bar_callback.disable()

        trainer.fit(
            model,
            train_dataloaders=train_dataloader,
            val_dataloaders=eval_dataloader,
        )
        (metrics,) = trainer.test(model, test_dataloader, verbose=False)
        metrics_in_domain[domain].append(metrics)

  0%|          | 0/9 [00:00<?, ?it/s]

Blog Authorship:   0%|          | 0/5 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes
------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?        
1 | projection  | Sequential        | 524 K  | train | [32, 16384] | [32, 32] 
2 | classifier  | Linear            | 33     | train | [32, 32]    | [32, 1]  
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?        
------------------------------------------------------------------------------------
676 K     Trainable params
0         Non-trainable params
676 K     Total params
2.706     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode
LOCAL_RANK: 0 - 

Student Essays:   0%|          | 0/5 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes
------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?        
1 | projection  | Sequential        | 524 K  | train | [32, 16384] | [32, 32] 
2 | classifier  | Linear            | 33     | train | [32, 32]    | [32, 1]  
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?        
------------------------------------------------------------------------------------
676 K     Trainable params
0         Non-trainable params
676 K     Total params
2.706     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode
LOCAL_RANK: 0 - 

CNN News:   0%|          | 0/5 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes
------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?        
1 | projection  | Sequential        | 524 K  | train | [32, 16384] | [32, 32] 
2 | classifier  | Linear            | 33     | train | [32, 32]    | [32, 1]  
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?        
------------------------------------------------------------------------------------
676 K     Trainable params
0         Non-trainable params
676 K     Total params
2.706     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode
LOCAL_RANK: 0 - 

Euro Court Cases:   0%|          | 0/5 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes
------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?        
1 | projection  | Sequential        | 524 K  | train | [32, 16384] | [32, 32] 
2 | classifier  | Linear            | 33     | train | [32, 32]    | [32, 1]  
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?        
------------------------------------------------------------------------------------
676 K     Trainable params
0         Non-trainable params
676 K     Total params
2.706     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode
LOCAL_RANK: 0 - 

House of Commons:   0%|          | 0/5 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes
------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?        
1 | projection  | Sequential        | 524 K  | train | [32, 16384] | [32, 32] 
2 | classifier  | Linear            | 33     | train | [32, 32]    | [32, 1]  
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?        
------------------------------------------------------------------------------------
676 K     Trainable params
0         Non-trainable params
676 K     Total params
2.706     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode
LOCAL_RANK: 0 - 

ArXiv Papers:   0%|          | 0/5 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes
------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?        
1 | projection  | Sequential        | 524 K  | train | [32, 16384] | [32, 32] 
2 | classifier  | Linear            | 33     | train | [32, 32]    | [32, 1]  
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?        
------------------------------------------------------------------------------------
676 K     Trainable params
0         Non-trainable params
676 K     Total params
2.706     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode
`Trainer.fit` st

Gutenberg:   0%|          | 0/5 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes
------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?        
1 | projection  | Sequential        | 524 K  | train | [32, 16384] | [32, 32] 
2 | classifier  | Linear            | 33     | train | [32, 32]    | [32, 1]  
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?        
------------------------------------------------------------------------------------
676 K     Trainable params
0         Non-trainable params
676 K     Total params
2.706     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode
LOCAL_RANK: 0 - 

Bundestag:   0%|          | 0/5 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes
------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?        
1 | projection  | Sequential        | 524 K  | train | [32, 16384] | [32, 32] 
2 | classifier  | Linear            | 33     | train | [32, 32]    | [32, 1]  
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?        
------------------------------------------------------------------------------------
676 K     Trainable params
0         Non-trainable params
676 K     Total params
2.706     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode
LOCAL_RANK: 0 - 

Spiegel:   0%|          | 0/5 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes
------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?        
1 | projection  | Sequential        | 524 K  | train | [32, 16384] | [32, 32] 
2 | classifier  | Linear            | 33     | train | [32, 32]    | [32, 1]  
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?        
------------------------------------------------------------------------------------
676 K     Trainable params
0         Non-trainable params
676 K     Total params
2.706     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode
LOCAL_RANK: 0 - 

In [11]:
config

{'eval_split': 0.1,
 'test_split': 0.2,
 'feature_model': 'gpt2',
 'synth_agent': 'gpt-4o-mini',
 'document_type': 'fulltext',
 'second_dim_as_channels': True,
 'feature_dim': TwoDimFeatures(width=256, height=13),
 'featurizer': 'IntermediateLikelihood(last_n=13)',
 'slicer': 'SliceRandomMultiple(size=64, multiple=4, stride=16, sort=False)',
 'num_samples': 1,
 'seed': 42,
 'projection_dim': 32,
 'learning_rate': 0.0001,
 'warmup_steps': 66,
 'max_epochs': 25,
 'gradient_clip_val': 1.0,
 'batch_size': 32,
 'conv_layer_shapes': [(64, 5, 1),
  (128, 3, 1),
  (128, 3, 1),
  (128, 3, 1),
  (64, 3, 1)]}

In [12]:
df = pd.DataFrame(
    [
        {
            "domain": domain,
            **{
                "test_auroc": metric["test_auroc"],
                "test_f1@0.5": metric["test_f1@0.5"],
            },
        }
        for domain in domains
        for metric in metrics_in_domain[domain]
    ]
)
df = (
    df.groupby("domain")
    .mean()
    .sort_index(key=lambda i: list(map(list(domains.keys()).index, i)))
)
print(
    df.to_latex(
        float_format="%.3f",
        index=False,
    )
)
df

\begin{tabular}{rr}
\toprule
test_auroc & test_f1@0.5 \\
\midrule
0.982 & 0.906 \\
0.976 & 0.912 \\
0.972 & 0.916 \\
0.990 & 0.937 \\
0.976 & 0.914 \\
0.987 & 0.925 \\
0.964 & 0.883 \\
0.943 & 0.865 \\
0.928 & 0.855 \\
\bottomrule
\end{tabular}



Unnamed: 0_level_0,test_auroc,test_f1@0.5
domain,Unnamed: 1_level_1,Unnamed: 2_level_1
Blog Authorship,0.982168,0.905689
Student Essays,0.975569,0.912166
CNN News,0.972229,0.915783
Euro Court Cases,0.989806,0.936931
House of Commons,0.975987,0.914006
ArXiv Papers,0.986814,0.924906
Gutenberg,0.963568,0.883373
Bundestag,0.943048,0.865373
Spiegel,0.928374,0.854966


## Out-of-Domain

In [None]:
from collections import defaultdict

metrics_out_of_domain = defaultdict(list)
for domain in tqdm(splits.keys()):
    for _ in trange(5, desc=domain, position=1):
        seed_everything(config["seed"], verbose=False)
        train_subsets = []
        eval_subsets = []
        for other, subsets in splits.items():
            if other == domain:
                subsets.append(subsets.pop(0))
                test_dataset = subsets[:2]
            else:
                eval_dataset = subsets.pop(0)
                eval_subsets.append(eval_dataset)
                train_subsets.extend(subsets[2:])
                subsets.append(eval_dataset)

        train_dataloader = get_dataloader(*train_subsets, shuffle=True)
        eval_dataloader = get_dataloader(*eval_subsets)
        test_dataloader = get_dataloader(*test_dataset)

        model = CNNDocumentClassficationModel(**config)
        trainer = Trainer(
            max_epochs=config["max_epochs"],
            logger=pl_loggers.TensorBoardLogger(
                save_dir=f"logs/in_domain/{type(featurizer).__name__}",
                name=domain,
            ),
            gradient_clip_val=config["gradient_clip_val"],
            callbacks=[EarlyStopping(monitor="val_loss", mode="min", patience=3)],
            deterministic=True,
        )
        trainer.progress_bar_callback.disable()

        trainer.fit(
            model,
            train_dataloaders=train_dataloader,
            val_dataloaders=eval_dataloader,
        )
        (metrics,) = trainer.test(model, test_dataloader, verbose=False)
        metrics_out_of_domain[domain].append(metrics)

        print(domain, metrics)

In [14]:
df = pd.DataFrame(
    [
        {
            "domain": domain,
            **{
                "test_auroc": metric["test_auroc"],
                "test_f1@0.5": metric["test_f1@0.5"],
            },
        }
        for domain in domains
        for metric in metrics_out_of_domain[domain]
    ]
)
df = (
    df.groupby("domain")
    .mean()
    .sort_index(key=lambda i: list(map(list(domains.keys()).index, i)))
)
print(
    df.to_latex(
        float_format="%.3f",
        index=False,
    )
)
df

\begin{tabular}{rr}
\toprule
test_auroc & test_f1@0.5 \\
\midrule
0.529 & 0.564 \\
0.750 & 0.730 \\
0.974 & 0.905 \\
0.941 & 0.846 \\
0.961 & 0.924 \\
0.985 & 0.941 \\
0.958 & 0.881 \\
0.769 & 0.715 \\
0.819 & 0.718 \\
\bottomrule
\end{tabular}



Unnamed: 0_level_0,test_auroc,test_f1@0.5
domain,Unnamed: 1_level_1,Unnamed: 2_level_1
Blog Authorship,0.528786,0.564271
Student Essays,0.749614,0.729688
CNN News,0.973518,0.905227
Euro Court Cases,0.94148,0.845686
House of Commons,0.961225,0.923868
ArXiv Papers,0.98527,0.940596
Gutenberg,0.958419,0.880942
Bundestag,0.768829,0.715218
Spiegel,0.818731,0.718269
