In [10]:
import csv
import os
from collections import Counter

import pandas
from pandas import DataFrame    
import requests
from datasets import load_dataset
import re

# Code adapted from Summac code: https://github.com/tingofurro/summac/blob/master/summac/benchmark.py

In [2]:
# Load the XSum test split and index its articles by id so that annotated
# summaries can later be joined back to their source document.
xsum = load_dataset("xsum")["test"]
xsumid2article = {}
for example in xsum:
    xsumid2article[example["id"]] = example["document"]

In [3]:
def clean_text(original_string):
    """Remove "Media playback ... device" spans from a string.

    Every non-overlapping span that starts with "Media playback" and runs up
    to the nearest following "device" is deleted. The match is non-greedy and
    `.` does not cross newlines, so a span is only removed when both markers
    sit on the same line.

    Args:
        original_string: Text to clean.

    Returns:
        The text with all matching spans removed; unchanged if none match.
    """
    media_notice = re.compile(r"Media playback.*?device")
    return media_notice.sub("", original_string)

In [4]:
def load_xsumfaith():
    """Download the XSum hallucination annotations and build a cleaned DataFrame.

    Source:
        On Faithfulness and Factuality in Abstractive Summarization - ACL 2020
        https://github.com/google-research-datasets/xsum_hallucination_annotations
        https://aclanthology.org/2020.acl-main.173.pdf

    The annotation CSV is downloaded into the local "test" folder, its rows are
    grouped by (bbcid, system), and each group becomes one output row. The
    source article is looked up in the module-level `xsumid2article` mapping
    and passed through `clean_text`. The group label is the most frequent
    `hallucination_type` in the group, with "NULL" reported as "faithful".

    Returns:
        pandas.DataFrame with the columns "document", "claim", "bbcid",
        "model_name", "label", "split" (always "not set") and "annotations"
        (the list of all labels in the group).

    Raises:
        requests.HTTPError: If the download returns an error status.
        requests.Timeout: If the server does not answer in time.
        KeyError: If an annotated bbcid is missing from `xsumid2article`.
    """
    dataset_folder = "test"
    annotation_url = (
        "https://github.com/google-research-datasets/xsum_hallucination_annotations"
        "/raw/master/hallucination_annotations_xsum_summaries.csv"
    )
    path_to_annotation = os.path.join(dataset_folder, "hallucination_annotations_xsum_summaries.csv")

    # On a fresh checkout the folder may not exist yet; opening the file for
    # writing would then fail with FileNotFoundError.
    os.makedirs(dataset_folder, exist_ok=True)

    # Fail loudly on HTTP errors instead of saving an error page as "CSV".
    response = requests.get(annotation_url, timeout=60)
    response.raise_for_status()
    with open(path_to_annotation, "wb") as f:
        f.write(response.content)

    # newline="" is what the csv module expects so that quoted fields with
    # embedded line breaks are parsed correctly; the explicit encoding keeps
    # parsing independent of the platform locale.
    with open(path_to_annotation, "r", encoding="utf-8", newline="") as f:
        dataset = list(csv.DictReader(f))

    # Collect all annotation rows that belong to the same (article, system) pair.
    groups = {}
    for d in dataset:
        groups.setdefault((d["bbcid"], d["system"]), []).append(d)

    clean_dataset = []
    cleaned_documents = 0
    for vs in groups.values():
        # All rows of a group share bbcid, system and summary; use the first one.
        A = vs[0]
        document = xsumid2article[A["bbcid"]]
        cleaned_document = clean_text(document)
        # A length change means clean_text removed something from this article.
        if len(cleaned_document) != len(document):
            cleaned_documents += 1
        if len(cleaned_document) < 10:
            print(f"Document {cleaned_document} is too short, skipping")
            continue
        labels = [v["hallucination_type"] for v in vs]
        annotations = ["faithful" if label == "NULL" else label for label in labels]
        # Majority vote over the raw labels of the group.
        most_common_label = Counter(labels).most_common(1)[0][0]
        label = "faithful" if most_common_label == "NULL" else most_common_label

        clean_dataset.append({
            "document": cleaned_document,
            "claim": A["summary"],
            "bbcid": A["bbcid"],
            "model_name": A["system"],
            "label": label,
            "split": "not set",
            "annotations": annotations,
        })
    print(f"Cleaned {cleaned_documents} documents")
    return pandas.DataFrame(clean_dataset)

In [5]:
# Download the annotations and build the cleaned dataset
# (one row per (bbcid, system) pair, see load_xsumfaith).
clean_xsum_faith_df = load_xsumfaith()

Cleaned 170 documents


In [6]:
# Preview the first rows of the cleaned dataset.
clean_xsum_faith_df.head()

Unnamed: 0,document,claim,bbcid,model_name,label,split,annotations
0,France's Dubuisson carded a 67 to tie with ove...,rory mcilroy will take a one-shot lead into th...,34687720,BERTS2S,extrinsic,not set,"[extrinsic, extrinsic, extrinsic]"
1,Sheikh Ali Salman told the BBC that for nation...,the leader of bahrain\'s main opposition party...,21267591,BERTS2S,faithful,not set,"[faithful, faithful, extrinsic, intrinsic]"
2,He died at his home in Cambridge following an ...,veteran classical music conductor christopher ...,29347895,BERTS2S,extrinsic,not set,"[extrinsic, extrinsic, intrinsic, extrinsic, i..."
3,"In the year to the end of March, 57 victims of...",the number of homicides recorded by police in ...,37618111,BERTS2S,extrinsic,not set,"[extrinsic, extrinsic, extrinsic]"
4,The Cherries went down 2-1 at Sunderland on Sa...,bournemouth manager eddie howe says his side a...,37895159,BERTS2S,extrinsic,not set,"[extrinsic, extrinsic, extrinsic, extrinsic, e..."


In [7]:
# Distribution of the majority-vote labels over the whole dataset.
clean_xsum_faith_df["label"].value_counts()

label
extrinsic    1795
intrinsic     448
faithful      257
Name: count, dtype: int64

In [8]:
# Sanity check: load_xsumfaith leaves every row's split as "not set".
clean_xsum_faith_df["split"].value_counts()

split
not set    2500
Name: count, dtype: int64

In [9]:
# Target number of rows per split. The allocation loop fills train first,
# then validation; everything left over goes to test.
train_size = 1050
val_size = 200
# NOTE(review): test_size is not referenced by the visible allocation code,
# which assigns the remainder to test — confirm whether it is still needed.
test_size = 1250  # Or use the remainder

# Group by 'bbcid'
grouped_xsum_faith = clean_xsum_faith_df.groupby('bbcid')

In [11]:
# Fixed seed so that Restart & Run All reproduces the same row order.
SPLIT_SEED = 42

# Shuffle all rows once, then stable-sort by 'bbcid': rows of one article stay
# together, in random order within the article. This yields the same layout as
# shuffling inside each group, without relying on groupby.apply keeping the
# grouping column (behaviour that recent pandas releases are phasing out).
shuffled_groups = (
    clean_xsum_faith_df
    .sample(frac=1, random_state=SPLIT_SEED)
    .sort_values("bbcid", kind="mergesort")
    .reset_index(drop=True)
)

# Allocate groups to train, validation, and test sets.
# Whole groups are assigned so that an article never spans two splits. Groups
# are visited in sorted 'bbcid' order, so split membership is deterministic.
# Parts are collected in lists and concatenated once, instead of growing a
# DataFrame inside the loop.
train_parts, val_parts, test_parts = [], [], []
train_rows = 0
val_rows = 0
for _, group in shuffled_groups.groupby("bbcid"):
    if train_rows < train_size:
        train_parts.append(group)
        train_rows += len(group)
    elif val_rows < val_size:
        val_parts.append(group)
        val_rows += len(group)
    else:
        test_parts.append(group)

# pandas.concat rejects an empty list; fall back to an empty frame that still
# carries the dataset's columns.
train = pandas.concat(train_parts) if train_parts else shuffled_groups.iloc[:0].copy()
val = pandas.concat(val_parts) if val_parts else shuffled_groups.iloc[:0].copy()
test = pandas.concat(test_parts) if test_parts else shuffled_groups.iloc[:0].copy()

# Every row must land in exactly one split.
assert len(train) + len(val) + len(test) == len(clean_xsum_faith_df), "rows were lost during splitting"

# Ensure the total size is correct
# (a split can overshoot its target because groups are never broken up).
if len(train) > train_size or len(val) > val_size:
    print("Adjustment needed due to group sizes.")


In [12]:
# Label distribution of the train split.
train["label"].value_counts()

label
extrinsic    727
intrinsic    197
faithful     126
Name: count, dtype: int64

In [13]:
# Label distribution of the validation split.
val["label"].value_counts()

label
extrinsic    143
intrinsic     39
faithful      18
Name: count, dtype: int64

In [14]:
# Label distribution of the test split.
test["label"].value_counts()

label
extrinsic    925
intrinsic    212
faithful     113
Name: count, dtype: int64

In [15]:
# Leakage check: an article (bbcid) must not appear in more than one split.
# All three pairs are compared, including validation vs. test.
test_ids = set(test["bbcid"].tolist())
train_ids = set(train["bbcid"].tolist())
val_ids = set(val["bbcid"].tolist())
assert test_ids.isdisjoint(train_ids), "bbcid overlap between test and train"
assert train_ids.isdisjoint(val_ids), "bbcid overlap between train and validation"
assert val_ids.isdisjoint(test_ids), "bbcid overlap between validation and test"

In [16]:
from datasets import DatasetDict, Dataset

# Wrap each split in a Hugging Face Dataset (with a fresh 0..n-1 index) and
# publish all three together under one dataset repository.
split_frames = {"train": train, "validation": val, "test": test}
clean_xsum_faith_dataset_dict = DatasetDict(
    {
        split_name: Dataset.from_pandas(frame.reset_index(drop=True))
        for split_name, frame in split_frames.items()
    }
)

clean_xsum_faith_dataset_dict.push_to_hub("mtc/full_cleaned_xsum_faith")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 447.27ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.34s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  3.59it/s]
Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 334.45ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.32s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  3.81it/s]
Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 473.40ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.53s/it]
Deleting unused files from dataset repository: 100%|███