# HarmEval Benchmark Prompt + Label Sampler and EDA

This notebook:
- Loads `SoftMINER-Group/HarmEval`.
- Samples 500 rows with a fixed random seed (17).
- Saves a CSV with columns `Question` and `Topic`.
- Includes a small EDA section: question length stats, topic distribution, and simple plots.

It does **not** run any models; it only prepares and analyzes the benchmark data.


In [None]:
# Install required libraries
!pip install -q datasets pandas matplotlib seaborn

import os
import numpy as np
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: apply seaborn's "whitegrid" theme.
sns.set(style="whitegrid")

# Destination for the CSV / PNG artifacts -- the current folder: 'toxicity metric'.
OUTPUT_DIR = "."

# One seed for the whole notebook: it seeds NumPy's global RNG here and is
# passed as `random_state` when the rows are sampled.
SEED = 17
np.random.seed(SEED)


In [None]:
# Load the HarmEval dataset and choose the split to sample from.
harmeval = load_dataset("SoftMINER-Group/HarmEval")
print(harmeval)

# Preference order: "test" first, then "validation", then "train".
SPLIT_PREFERENCE = ("test", "validation", "train")
split_name = next((name for name in SPLIT_PREFERENCE if name in harmeval), None)
if split_name is None:
    # Same exception type a bare harmeval["train"] lookup would raise,
    # but the message lists what is actually available.
    raise KeyError(
        f"None of the splits {SPLIT_PREFERENCE} found in HarmEval; "
        f"available splits: {list(harmeval.keys())}"
    )
harmeval_split = harmeval[split_name]

# Print the split *name* and its size; the Dataset object's repr alone does not
# say which split was selected.
print("Using split:", split_name, f"({len(harmeval_split)} rows)")
print("Columns:", harmeval_split.column_names)


In [None]:
# Convert the chosen split to pandas, shuffle it with the fixed seed,
# and keep the first N rows of the shuffled frame as the sample.

df = harmeval_split.to_pandas()

# Both columns are required by every later cell; fail early if either is missing.
for required_col in ("Question", "Topic"):
    assert required_col in df.columns, f"Expected a '{required_col}' column in HarmEval."

# Full deterministic shuffle; the index is rebuilt so positions run 0..len-1.
df = df.sample(frac=1.0, random_state=SEED).reset_index(drop=True)

# Cap at 500 rows, or take everything when the split is smaller than that.
N = min(500, len(df))
sample_df = df.head(N).copy()
print("Number of rows selected:", len(sample_df))


In [None]:
# Build the final two-column frame: Question and Topic, both cast to str.

# `.assign` returns a new frame, so `sample_df` is left untouched; overwriting
# existing columns keeps their original order.
prompts_df = sample_df[["Question", "Topic"]].assign(
    Question=lambda frame: frame["Question"].astype(str),
    Topic=lambda frame: frame["Topic"].astype(str),
)

assert prompts_df.columns.tolist() == ["Question", "Topic"]
print(prompts_df.head())


In [None]:
# Save sampled prompts to CSV

output_path = os.path.join(OUTPUT_DIR, "harmeval_prompts_labeled.csv")

# Pin the written columns explicitly. The EDA cell further down adds helper
# columns (`char_len`, `word_len`) to `prompts_df`; without this, re-running
# this cell after the EDA cell would leak those columns into the CSV.
prompts_df.to_csv(output_path, columns=["Question", "Topic"], index=False)

print("Saved labeled HarmEval prompts to:", output_path)
print("Total rows in CSV:", len(prompts_df))


In [None]:
# EDA: basic statistics
#
# Adds two helper columns to `prompts_df` (`char_len`, `word_len`; the plotting
# cells read `word_len`), prints length and topic summaries, and writes two
# summary CSVs into OUTPUT_DIR.

# Characters per question, and whitespace-separated tokens per question.
prompts_df["char_len"] = prompts_df["Question"].str.len()
prompts_df["word_len"] = prompts_df["Question"].str.split().str.len()

print("=== Question length (characters) ===")
print(prompts_df["char_len"].describe())

print("\n=== Question length (words) ===")
print(prompts_df["word_len"].describe())

print("\n=== Topic distribution ===")
topic_counts = prompts_df["Topic"].value_counts()
print(topic_counts)

# Share of the sample per topic, as a percentage of all sampled rows.
topic_percent = 100 * topic_counts / len(prompts_df)
topic_summary = pd.DataFrame({
    "count": topic_counts,
    "percent": topic_percent.round(2),
})
print("\n=== Topic distribution (count and %) ===")
print(topic_summary)

topic_summary_path = os.path.join(OUTPUT_DIR, "harmeval_topic_summary.csv")
topic_summary.to_csv(topic_summary_path, index_label="Topic")
print("Saved topic summary to:", topic_summary_path)

# Mean and median of both length measures within each topic
# (the result has two-level column labels: measure x statistic).
length_by_topic = prompts_df.groupby("Topic")[["word_len", "char_len"]].agg(["mean", "median"])
print("\n=== Question length by Topic (mean/median) ===")
print(length_by_topic)

length_by_topic_path = os.path.join(OUTPUT_DIR, "harmeval_length_by_topic.csv")
length_by_topic.to_csv(length_by_topic_path)
print("Saved length-by-topic stats to:", length_by_topic_path)


In [None]:
# Plot: Histogram of word lengths (30 bins), saved as a PNG and shown inline.

fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(prompts_df["word_len"], bins=30)
ax.set_xlabel("Words per question")
ax.set_ylabel("Count")
ax.set_title("Distribution of question length (words)")

# Save before plt.show() so the file is written from the fully drawn figure.
hist_path = os.path.join(OUTPUT_DIR, "harmeval_word_len_hist.png")
fig.savefig(hist_path, bbox_inches="tight", dpi=150)
plt.show()
print("Saved:", hist_path)


In [None]:
# Plot: Topic counts bar chart, one bar per topic in `topic_counts` order.

fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x=topic_counts.index, y=topic_counts.values, ax=ax)

# Tilt the topic labels so long names do not overlap.
plt.xticks(rotation=45, ha="right")
ax.set_ylabel("Count")
ax.set_title("HarmEval topic distribution")

topic_bar_path = os.path.join(OUTPUT_DIR, "harmeval_topic_counts.png")
fig.savefig(topic_bar_path, bbox_inches="tight", dpi=150)
plt.show()
print("Saved:", topic_bar_path)


In [None]:
# Plot: Boxplot of word length by topic (one box per Topic value).

fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=prompts_df, x="Topic", y="word_len", ax=ax)

# Tilt the topic labels so long names do not overlap.
plt.xticks(rotation=45, ha="right")
ax.set_ylabel("Words per question")
ax.set_title("Question length by topic")

box_path = os.path.join(OUTPUT_DIR, "harmeval_word_len_by_topic.png")
fig.savefig(box_path, bbox_inches="tight", dpi=150)
plt.show()
print("Saved:", box_path)


## Summary

- 500 sampled rows from `SoftMINER-Group/HarmEval` with `Question` and `Topic`.
- CSV file: `harmeval_prompts_labeled.csv`.
- EDA outputs:
  - `harmeval_topic_summary.csv`
  - `harmeval_length_by_topic.csv`
  - plots: `harmeval_word_len_hist.png`, `harmeval_topic_counts.png`, `harmeval_word_len_by_topic.png`.

These statistics and plots can be used directly in the report's EDA section for the toxicity benchmark. This notebook does **not** run any model generation or DeepEval code; it only prepares and analyzes the benchmark data.
