In [1]:
import os
import warnings

import pandas as pd
from dotenv import load_dotenv
from lightning.pytorch import Trainer, seed_everything
from lightning.pytorch import loggers as pl_loggers
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from tqdm.auto import tqdm, trange

from luminar.data import (
    FeatureDataset,
    PaddingDataloader,
    n_way_split,
)
from luminar.model import CNNDocumentClassficationModel, ConvolutionalLayerSpec
from luminar.features import FeatureExtractor, OneDimFeatures, Slicer, TwoDimFeatures
from luminar.mongo import PrismaiDataset

load_dotenv("../.env")

warnings.filterwarnings("ignore", ".*does not have many workers.*")

In [2]:
domains = {
    "Blog Authorship": {"domain": "blog_authorship_corpus"},
    "Student Essays": {"domain": "student_essays"},
    "CNN News": {"domain": "cnn_news"},
    "Euro Court Cases": {"domain": "euro_court_cases"},
    "House of Commons": {"domain": "house_of_commons"},
    "ArXiv Papers": {"domain": "arxiv_papers"},
    "Gutenberg": {"domain": "gutenberg", "lang": "en-EN"},
    "Bundestag": {"domain": "bundestag"},
    "Spiegel": {"domain": "spiegel_articles"},
}

In [3]:
config = {
    "eval_split": 0.1,
    "test_split": 0.2,
    "feature_model": "gpt2",
    "synth_agent": "gpt-4o-mini",
    "document_type": "fulltext",
}

## Features

In [6]:
first_dim = 256
k = 13

feature_dim = TwoDimFeatures(first_dim, k)
featurizer = FeatureExtractor.IntermediateLikelihood(k)
config["second_dim_as_channels"] = True

# slicer = Slicer.First(first_dim)
multiple = 4
slicer = Slicer.RandomMultiple(first_dim // multiple, multiple=multiple, stride=16)

config["feature_dim"] = feature_dim
config["featurizer"] = repr(featurizer)
config["slicer"] = repr(slicer)

config["num_samples"] = 1


def featurize(dataset) -> FeatureDataset:
    return FeatureDataset.from_prismai(
        tqdm(dataset, position=1, leave=False),
        slicer,
        featurizer,
        num_samples=config["num_samples"],
    )

In [7]:
config["seed"] = 42

sizes = [0.1] * 10
splits = {}
for domain, kwargs in tqdm(domains.items(), desc="Domains", position=0):
    seed_everything(config["seed"], verbose=False)
    splits[domain] = [
        featurize(subset)
        for subset in n_way_split(
            PrismaiDataset(
                mongo_db_connection=os.getenv("MONGO_DB_CONNECTION"),
                database="prismai",
                collection="features_prismai",
                feature_model=config["feature_model"],
                synth_agent=config["synth_agent"],
                document_type=config["document_type"],
                additional_match_conditions=config.get(
                    "additional_match_conditions", {}
                ),
                **kwargs,
                # update_cache=True,
            ).load(verbose=False),
            *sizes,
        )
    ]

Domains:   0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:03<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/149 [00:00<?, ?it/s]

  0%|          | 0/149 [00:00<?, ?it/s]

  0%|          | 0/149 [00:00<?, ?it/s]

  0%|          | 0/149 [00:00<?, ?it/s]

  0%|          | 0/149 [00:00<?, ?it/s]

  0%|          | 0/149 [00:00<?, ?it/s]

  0%|          | 0/149 [00:00<?, ?it/s]

  0%|          | 0/149 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

In [8]:
config |= {
    "projection_dim": 32,
    "learning_rate": 0.0001,
    "warmup_steps": 66,
    "max_epochs": 25,
    "gradient_clip_val": 1.0,
    "batch_size": 32,
}

# SeqXGPT Layer Configuration
config["conv_layer_shapes"] = [
    ConvolutionalLayerSpec(64, 5),
    *[ConvolutionalLayerSpec(128, 3)] * 3,
    ConvolutionalLayerSpec(64, 3),
]

In [9]:
from torch.utils.data import ConcatDataset


# capturing config from "closure"
def get_dataloader(*dataset, **kwargs) -> PaddingDataloader:
    if len(dataset) == 1:
        dataset = dataset[0]
    else:
        dataset = ConcatDataset(dataset)
    return PaddingDataloader(
        dataset,
        feature_dim=config["feature_dim"],
        batch_size=config["batch_size"],
        **kwargs,
    )

## In-Domain Training & Evaluation

In [10]:
from collections import defaultdict

metrics_in_domain = defaultdict(list)
for domain, subsets in tqdm(splits.items()):
    for _ in trange(5, desc=domain, position=1):
        seed_everything(config["seed"], verbose=False)
        # cycle through splits for cross-validation
        eval_dataset = subsets.pop(0)
        test_dataloader = get_dataloader(*subsets[:2])
        train_dataloader = get_dataloader(*subsets[2:], shuffle=True)
        eval_dataloader = get_dataloader(eval_dataset)
        subsets.append(eval_dataset)

        model = CNNDocumentClassficationModel(**config)
        trainer = Trainer(
            max_epochs=config["max_epochs"],
            logger=pl_loggers.TensorBoardLogger(
                save_dir=f"logs/in_domain/{type(featurizer).__name__}",
                name=domain,
            ),
            gradient_clip_val=config["gradient_clip_val"],
            callbacks=[EarlyStopping(monitor="val_loss", mode="min", patience=3)],
            deterministic=True,
        )
        trainer.progress_bar_callback.disable()

        trainer.fit(
            model,
            train_dataloaders=train_dataloader,
            val_dataloaders=eval_dataloader,
        )
        (metrics,) = trainer.test(model, test_dataloader, verbose=False)
        metrics_in_domain[domain].append(metrics)

  0%|          | 0/9 [00:00<?, ?it/s]

Blog Authorship:   0%|          | 0/5 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes
------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?        
1 | projection  | Sequential        | 524 K  | train | [32, 16384] | [32, 32] 
2 | classifier  | Linear            | 33     | train | [32, 32]    | [32, 1]  
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?        
------------------------------------------------------------------------------------
676 K     Trainable params
0         Non-trainable params
676 K     Total params
2.706     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode
LOCAL_RANK: 0 - 

Student Essays:   0%|          | 0/5 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes
------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?        
1 | projection  | Sequential        | 524 K  | train | [32, 16384] | [32, 32] 
2 | classifier  | Linear            | 33     | train | [32, 32]    | [32, 1]  
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?        
------------------------------------------------------------------------------------
676 K     Trainable params
0         Non-trainable params
676 K     Total params
2.706     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode
LOCAL_RANK: 0 - 

CNN News:   0%|          | 0/5 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes
------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?        
1 | projection  | Sequential        | 524 K  | train | [32, 16384] | [32, 32] 
2 | classifier  | Linear            | 33     | train | [32, 32]    | [32, 1]  
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?        
------------------------------------------------------------------------------------
676 K     Trainable params
0         Non-trainable params
676 K     Total params
2.706     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode
LOCAL_RANK: 0 - 

Euro Court Cases:   0%|          | 0/5 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes
------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?        
1 | projection  | Sequential        | 524 K  | train | [32, 16384] | [32, 32] 
2 | classifier  | Linear            | 33     | train | [32, 32]    | [32, 1]  
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?        
------------------------------------------------------------------------------------
676 K     Trainable params
0         Non-trainable params
676 K     Total params
2.706     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode
LOCAL_RANK: 0 - 

House of Commons:   0%|          | 0/5 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes
------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?        
1 | projection  | Sequential        | 524 K  | train | [32, 16384] | [32, 32] 
2 | classifier  | Linear            | 33     | train | [32, 32]    | [32, 1]  
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?        
------------------------------------------------------------------------------------
676 K     Trainable params
0         Non-trainable params
676 K     Total params
2.706     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode
LOCAL_RANK: 0 - 

ArXiv Papers:   0%|          | 0/5 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes
------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?        
1 | projection  | Sequential        | 524 K  | train | [32, 16384] | [32, 32] 
2 | classifier  | Linear            | 33     | train | [32, 32]    | [32, 1]  
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?        
------------------------------------------------------------------------------------
676 K     Trainable params
0         Non-trainable params
676 K     Total params
2.706     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode
`Trainer.fit` st

Gutenberg:   0%|          | 0/5 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes
------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?        
1 | projection  | Sequential        | 524 K  | train | [32, 16384] | [32, 32] 
2 | classifier  | Linear            | 33     | train | [32, 32]    | [32, 1]  
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?        
------------------------------------------------------------------------------------
676 K     Trainable params
0         Non-trainable params
676 K     Total params
2.706     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode
LOCAL_RANK: 0 - 

Bundestag:   0%|          | 0/5 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes
------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?        
1 | projection  | Sequential        | 524 K  | train | [32, 16384] | [32, 32] 
2 | classifier  | Linear            | 33     | train | [32, 32]    | [32, 1]  
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?        
------------------------------------------------------------------------------------
676 K     Trainable params
0         Non-trainable params
676 K     Total params
2.706     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode
LOCAL_RANK: 0 - 

Spiegel:   0%|          | 0/5 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name        | Type              | Params | Mode  | In sizes    | Out sizes
------------------------------------------------------------------------------------
0 | conv_layers | Sequential        | 152 K  | train | ?           | ?        
1 | projection  | Sequential        | 524 K  | train | [32, 16384] | [32, 32] 
2 | classifier  | Linear            | 33     | train | [32, 32]    | [32, 1]  
3 | criterion   | BCEWithLogitsLoss | 0      | train | ?           | ?        
------------------------------------------------------------------------------------
676 K     Trainable params
0         Non-trainable params
676 K     Total params
2.706     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode
LOCAL_RANK: 0 - 

In [11]:
config

{'eval_split': 0.1,
 'test_split': 0.2,
 'feature_model': 'gpt2',
 'synth_agent': 'gpt-4o-mini',
 'document_type': 'fulltext',
 'second_dim_as_channels': True,
 'feature_dim': TwoDimFeatures(width=256, height=13),
 'featurizer': 'IntermediateLikelihood(last_n=13)',
 'slicer': 'SliceRandomMultiple(size=64, multiple=4, stride=16, sort=False)',
 'num_samples': 1,
 'seed': 42,
 'projection_dim': 32,
 'learning_rate': 0.0001,
 'warmup_steps': 66,
 'max_epochs': 25,
 'gradient_clip_val': 1.0,
 'batch_size': 32,
 'conv_layer_shapes': [(64, 5, 1),
  (128, 3, 1),
  (128, 3, 1),
  (128, 3, 1),
  (64, 3, 1)]}

In [12]:
df = pd.DataFrame(
    [
        {
            "domain": domain,
            **{
                "test_auroc": metric["test_auroc"],
                "test_f1@0.5": metric["test_f1@0.5"],
            },
        }
        for domain in domains
        for metric in metrics_in_domain[domain]
    ]
)
df = (
    df.groupby("domain")
    .mean()
    .sort_index(key=lambda i: list(map(list(domains.keys()).index, i)))
)
print(
    df.to_latex(
        float_format="%.3f",
        index=False,
    )
)
df

\begin{tabular}{rr}
\toprule
test_auroc & test_f1@0.5 \\
\midrule
0.982 & 0.906 \\
0.976 & 0.912 \\
0.972 & 0.916 \\
0.990 & 0.937 \\
0.976 & 0.914 \\
0.987 & 0.925 \\
0.964 & 0.883 \\
0.943 & 0.865 \\
0.928 & 0.855 \\
\bottomrule
\end{tabular}



Unnamed: 0_level_0,test_auroc,test_f1@0.5
domain,Unnamed: 1_level_1,Unnamed: 2_level_1
Blog Authorship,0.982168,0.905689
Student Essays,0.975569,0.912166
CNN News,0.972229,0.915783
Euro Court Cases,0.989806,0.936931
House of Commons,0.975987,0.914006
ArXiv Papers,0.986814,0.924906
Gutenberg,0.963568,0.883373
Bundestag,0.943048,0.865373
Spiegel,0.928374,0.854966


## Out-of-Domain

In [None]:
from collections import defaultdict

metrics_out_of_domain = defaultdict(list)
for domain in tqdm(splits.keys()):
    for _ in trange(5, desc=domain, position=1):
        seed_everything(config["seed"], verbose=False)
        train_subsets = []
        eval_subsets = []
        for other, subsets in splits.items():
            if other == domain:
                subsets.append(subsets.pop(0))
                test_dataset = subsets[:2]
            else:
                eval_dataset = subsets.pop(0)
                eval_subsets.append(eval_dataset)
                train_subsets.extend(subsets[2:])
                subsets.append(eval_dataset)

        train_dataloader = get_dataloader(*train_subsets, shuffle=True)
        eval_dataloader = get_dataloader(*eval_subsets)
        test_dataloader = get_dataloader(*test_dataset)

        model = CNNDocumentClassficationModel(**config)
        trainer = Trainer(
            max_epochs=config["max_epochs"],
            logger=pl_loggers.TensorBoardLogger(
                save_dir=f"logs/in_domain/{type(featurizer).__name__}",
                name=domain,
            ),
            gradient_clip_val=config["gradient_clip_val"],
            callbacks=[EarlyStopping(monitor="val_loss", mode="min", patience=3)],
            deterministic=True,
        )
        trainer.progress_bar_callback.disable()

        trainer.fit(
            model,
            train_dataloaders=train_dataloader,
            val_dataloaders=eval_dataloader,
        )
        (metrics,) = trainer.test(model, test_dataloader, verbose=False)
        metrics_out_of_domain[domain].append(metrics)

        print(domain, metrics)

In [14]:
df = pd.DataFrame(
    [
        {
            "domain": domain,
            **{
                "test_auroc": metric["test_auroc"],
                "test_f1@0.5": metric["test_f1@0.5"],
            },
        }
        for domain in domains
        for metric in metrics_out_of_domain[domain]
    ]
)
df = (
    df.groupby("domain")
    .mean()
    .sort_index(key=lambda i: list(map(list(domains.keys()).index, i)))
)
print(
    df.to_latex(
        float_format="%.3f",
        index=False,
    )
)
df

\begin{tabular}{rr}
\toprule
test_auroc & test_f1@0.5 \\
\midrule
0.529 & 0.564 \\
0.750 & 0.730 \\
0.974 & 0.905 \\
0.941 & 0.846 \\
0.961 & 0.924 \\
0.985 & 0.941 \\
0.958 & 0.881 \\
0.769 & 0.715 \\
0.819 & 0.718 \\
\bottomrule
\end{tabular}



Unnamed: 0_level_0,test_auroc,test_f1@0.5
domain,Unnamed: 1_level_1,Unnamed: 2_level_1
Blog Authorship,0.528786,0.564271
Student Essays,0.749614,0.729688
CNN News,0.973518,0.905227
Euro Court Cases,0.94148,0.845686
House of Commons,0.961225,0.923868
ArXiv Papers,0.98527,0.940596
Gutenberg,0.958419,0.880942
Bundestag,0.768829,0.715218
Spiegel,0.818731,0.718269
