## Preprocess SummEval (Yale-LILY) for collecting activations

### Load the dataset

In [None]:
from datasets import load_dataset


# Pull the SummEval copy published under davidanugraha/SummEval
dataset = load_dataset(path="davidanugraha/SummEval")

README.md:   0%|          | 0.00/828 [00:00<?, ?B/s]

summeval.json:   0%|          | 0.00/9.42M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1700 [00:00<?, ? examples/s]

In [6]:
# List the train split's columns so the field names used by the
# preprocessing step below can be checked against what was actually loaded
print(dataset["train"].column_names)

['id', 'model_id', 'filepath', 'hyp', 'refs', 'source', 'expert_coherence', 'expert_consistency', 'expert_fluency', 'expert_relevance']


### Preprocess dataset and update attribute columns

In [10]:
import pandas as pd

def transform_data(row, threshold=3.0):
    """Derive the downstream fields from a single SummEval record.

    Args:
        row: Mapping that provides "id", "hyp", "source" and the four expert
            scores "expert_coherence", "expert_consistency",
            "expert_fluency" and "expert_relevance".
        threshold: Accepted for backward compatibility only and has no effect
            on the result. It previously fed a binary label that was computed
            but never returned; that dead code has been removed.
            NOTE(review): if a thresholded label was meant to be emitted
            (e.g. as "is_factual"), that needs to be decided and wired in.

    Returns:
        dict with the keys:
            "id": copied from the input record.
            "summary": the record's "hyp" text.
            "doc": the record's "source" text.
            "is_factual": the expert consistency score, passed through
                unchanged (it is NOT binarized).
            "average_overall_score": mean of the four expert scores, rounded
                to the nearest integer by the built-in round() (ties go to
                the even integer).
    """
    # round() without ndigits returns an int, so the stored value is the
    # rounded mean rather than the raw fractional average.
    average_score = round((
        row["expert_coherence"] +
        row["expert_consistency"] +
        row["expert_fluency"] +
        row["expert_relevance"]
    ) / 4)

    return {
        "id": row["id"],
        "summary": row["hyp"],
        "doc": row["source"],
        "is_factual": row["expert_consistency"],  # factuality
        "average_overall_score": average_score  # rounded mean of the four expert scores
    }

# Run transform_data over every record of every split; the returned keys are
# added next to the existing columns
transformed_dataset = dataset.map(function=transform_data)


Map:   0%|          | 0/1700 [00:00<?, ? examples/s]

In [11]:
# Print the first transformed record as a sanity check.
# "train" is the split being inspected; substitute another split name if needed.
print(transformed_dataset["train"][0])  # Replace "train" with the appropriate split

{'id': 'dm-test-8764fb95bfad8ee849274873a92fb8d6b400eee2', 'model_id': 'M11', 'filepath': 'cnndm/dailymail/stories/8764fb95bfad8ee849274873a92fb8d6b400eee2.story', 'hyp': "paul merson was brought on with only seven minutes remaining in his team 's 0-0 draw with burnley . andros townsend scored the tottenham midfielder in the 89th minute . paul merson had another dig at andros townsend after his appearance . the midfielder had been brought on to the england squad last week . click here for all the latest arsenal news news .", 'refs': ["Andros Townsend an 83rd minute sub in Tottenham's draw with Burnley. He was unable to find a winner as the game ended without a goal. Townsend had clashed with Paul Merson last week over England call-up.", 'Sports columnist Paul Merson and Andros Townsend are in the midst of a twitter feud. Merson started it when Townsend was called up and wrote something disparaging about him in his column. Since then things have gone back and forth between the two.', 'M

In [12]:
# List the train split's columns after the transformation step
print(transformed_dataset["train"].column_names)

['id', 'model_id', 'filepath', 'hyp', 'refs', 'source', 'expert_coherence', 'expert_consistency', 'expert_fluency', 'expert_relevance', 'summary', 'doc', 'is_factual', 'average_overall_score']


### Remove extra columns

In [13]:
# Only these fields are needed downstream; every other column gets dropped
columns_to_keep = ["id", "summary", "doc", "is_factual", "average_overall_score"]

# Names found in the train split that are not on the keep-list
columns_to_drop = [
    name
    for name in transformed_dataset["train"].column_names
    if name not in columns_to_keep
]

# Drop them and rebind the name to the result
transformed_dataset = transformed_dataset.remove_columns(columns_to_drop)

# Show the schema that is left
print(transformed_dataset["train"].column_names)


['id', 'summary', 'doc', 'is_factual', 'average_overall_score']


In [14]:
# Persist the pruned dataset under the relative directory "cnndm_factual"
# (resolved against the current working directory)
transformed_dataset.save_to_disk("cnndm_factual")

Saving the dataset (0/1 shards):   0%|          | 0/1700 [00:00<?, ? examples/s]

### Load the preprocessed dataset

In [1]:
from datasets import Dataset

# Reload just the train split from the "cnndm_factual" directory written by the
# save step. Note that `dataset` now refers to this single split, not to the
# multi-split object loaded at the top of the notebook.
dataset = Dataset.load_from_disk("cnndm_factual/train")
print(dataset)

Dataset({
    features: ['id', 'summary', 'doc', 'is_factual', 'average_overall_score'],
    num_rows: 1700
})


In [4]:
# Spot-check one record (index 200) of the reloaded split
print(dataset[200])

{'id': 'dm-test-eeef09d26cf30c2124c0399b08eedc6321fe5d20', 'summary': 'neymar and alves headed to watch el clasico on thursday night . dani alves proved their dedication to barcelona by supporting basketball side . real madrid remain top of their euro league division over their bitter rivals .', 'doc': "Team-mates Neymar and Dani Alves proved their dedication to Barcelona by supporting the club’s basketball side. Neymar and Alves headed to watch El Clasico on Thursday night alongside the Brazilian's sister Rafaella. Barca prevailed with a narrow 85-80 victory in the Euro League contest. Brazil star Neymar (centre) takes a selfie with friends and Barcelona team-mate Dani Alves (right) However Real Madrid remain top of their Euro League division over their bitter rivals, just by points difference. Neymar helped Brazil beat Chile 1-0 at the Emirates Stadium on Sunday in a feisty contest and had to withstand several brutal challenges from the South American rivals. Before the international