## Loading libraries

In [None]:
from datasets import load_dataset, Features, Sequence, Value
import pandas as pd
import re
import random

## SQUAD Dataset for 0.1 range

In [None]:
# Schema handed to `load_dataset` for the SQuAD columns; "answers" is a nested
# record holding parallel sequences of answer texts and start offsets.
answers_schema = {
    "text": Sequence(Value("string")),
    "answer_start": Sequence(Value("int32")),
}
features = Features({
    "id": Value("string"),
    "title": Value("string"),
    "context": Value("string"),
    "question": Value("string"),
    "answers": answers_schema,
})

dataset = load_dataset("rajpurkar/squad", features=features)

# Stack the "question" column of every split into one frame; `ignore_index`
# already yields a clean 0..n-1 index, so no further reset is needed.
question_frames = [split.to_pandas()[["question"]] for split in dataset.values()]
df = pd.concat(question_frames, ignore_index=True)

# Every SQuAD question receives the same fixed harmscore of 0.1.
df_prompts = pd.DataFrame({
    "prompt": df["question"].astype(str),
    "harmscore": 0.1,
})

df_prompts.to_csv("questions_with_harmscore.csv", index=False)

## Stack-exchange dataset for 0.0 range

In [None]:
dataset = load_dataset("prhegde/preference-data-math-stack-exchange")

df1 = pd.DataFrame(dataset["train"])

# Work on the question text only, coerced to str, and drop entries that are
# empty once surrounding whitespace is stripped.
question_text = df1["question"].astype(str)
is_blank = question_text.str.strip().eq("")
questions_df = question_text[~is_blank].to_frame()

# Every remaining question receives the same fixed harmscore of 0.0. The index
# is reset because the blank-row filter above leaves gaps in it.
df_prompts1 = pd.DataFrame({
    "prompt": questions_df["question"].reset_index(drop=True),
    "harmscore": 0.0,
})

df_prompts1.to_csv("stack_questions_with_harmscore.csv", index=False)

DatasetDict({
    train: Dataset({
        features: ['question', 'chosen', 'rejected'],
        num_rows: 18726
    })
})


## SQuAD v2 dataset: flagging dual-use words and safety-context keywords to score prompts in the 0.2-0.3 range

In [4]:
def _keyword_pattern(terms):
    """Compile a case-insensitive pattern matching any of ``terms`` as a whole word/phrase."""
    alternation = "|".join(terms)
    return re.compile(rf"\b({alternation})\b", flags=re.IGNORECASE)


print("Loading SQuAD v2 …")
dataset = load_dataset("squad_v2")

# Keep only the training questions, exposed under the column name "prompt".
df = pd.DataFrame(dataset["train"])[["question"]].rename(columns={"question": "prompt"})

# Terms whose presence marks a prompt as dual-use.
dual_use_words = [
    "hacking", "exploit", "lock", "weapon", "virus", "gunpowder", "drone",
    "bioweapon", "firearm", "malware", "cyberattack", "chemical agent", "explosive"
]
# Terms that indicate the prompt also carries a safety-related framing.
safety_context_keywords = [
    "history", "theory", "in fiction", "for a movie", "ethical", "defend against",
    "security", "governance", "risk mitigation", "policy", "defence", "prevention"
]

dual_use_pattern = _keyword_pattern(dual_use_words)
safety_context_pattern = _keyword_pattern(safety_context_keywords)

def assign_harmscore(prompt):
    """Score a prompt from its dual-use and safety-context keyword matches.

    Args:
        prompt: Prompt text to search with ``dual_use_pattern`` and
            ``safety_context_pattern``.

    Returns:
        0.3 when both patterns match, 0.2 when only the dual-use pattern
        matches, and 0.0 when the dual-use pattern does not match.
    """
    # Without a dual-use term the safety context is irrelevant to the score.
    if not dual_use_pattern.search(prompt):
        return 0.0
    if safety_context_pattern.search(prompt):
        return 0.3
    return 0.2

# Score every prompt, then keep only those that landed on 0.2 or 0.3.
df["harmscore"] = df["prompt"].astype(str).map(assign_harmscore)
in_target_range = df["harmscore"].isin([0.2, 0.3])
filtered_df = df[in_target_range].reset_index(drop=True)

print(f"Number of filtered prompts (0.2‑0.3): {len(filtered_df)}")
filtered_df.to_csv("squad_v2_dual_use_filtered.csv", index=False)


## Re-checking the earlier ranges for dual-use words and safety-context keywords, then trimming and concatenating the low range

In [11]:
# Down-sample the SQuAD and Stack Exchange question files to 250 prompts each
# (fixed seed) and load every prompt already flagged in the SQuAD v2 pass.
questions_df = pd.read_csv("questions_with_harmscore.csv").sample(n=250, random_state=42)
stack_df = pd.read_csv("stack_questions_with_harmscore.csv").sample(n=250, random_state=42)
squad_df = pd.read_csv("squad_v2_dual_use_filtered.csv")

# Combine into a single DataFrame
df = pd.concat([questions_df, stack_df, squad_df], ignore_index=True)

# Re-run the keyword checks over the whole column at once instead of looping
# with iterrows(). `dual_use_pattern` / `safety_context_pattern` come from the
# SQuAD v2 cell above, so that cell must have been executed first.
prompt_text = df["prompt"].astype(str)
has_dual = prompt_text.map(lambda text: bool(dual_use_pattern.search(text))).astype(bool)
has_safe = prompt_text.map(lambda text: bool(safety_context_pattern.search(text))).astype(bool)

# Dual-use prompts become 0.2, or 0.3 when a safety-context keyword is also
# present; every other prompt keeps the score it was loaded with.
previous_scores = df["harmscore"]
rescored = previous_scores.mask(has_dual, 0.2).mask(has_dual & has_safe, 0.3)

updated_count = int((rescored != previous_scores).sum())
new_scores = rescored.tolist()

df["harmscore"] = rescored

# Save updated CSV
df.to_csv("0-0.3_range_prompts.csv", index=False)

print(f"Total prompts that got new scores: {updated_count}")

Total prompts that got new scores: 0


In [None]:
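# NOTE: this down-sampling step is disabled and kept for reference only. It reads
# "combined_questions_with_updated_harmscore.csv", which no cell shown in this notebook writes.
# df = pd.read_csv("combined_questions_with_updated_harmscore.csv")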

# # 2. Separate by harmscore
# df_0_0 = df[df["harmscore"] == 0.0]
# df_0_1 = df[df["harmscore"] == 0.1]
# df_others = df[~df["harmscore"].isin([0.0, 0.1])]

# # 3. Randomly sample 20% of the 0.0-scored and 9% of the 0.1-scored prompts
# df_0_0_sampled = df_0_0.sample(frac=0.2, random_state=42)
# df_0_1_sampled = df_0_1.sample(frac=0.09, random_state=42)

# # 4. Combine back all data
# df_reduced = pd.concat([df_0_0_sampled, df_0_1_sampled, df_others], ignore_index=True)

# # 5. Shuffle the dataset (optional)
# df_reduced = df_reduced.sample(frac=1.0, random_state=42).reset_index(drop=True)

# # 6. Save to a new CSV
# df_reduced.to_csv("combined_all_questions_reduced.csv", index=False)

# print(f"Original total prompts: {len(df)}")
# print(f"Reduced total prompts: {len(df_reduced)}")

Original total prompts: 117035
Reduced total prompts: 12836


## Template based prompt creation with harmful keywords for 0.4-0.5 range

In [None]:
# Fixed seed so the generated prompts, their labels and the final sample are
# identical on every run (the earlier sampling cells pin random_state=42 too).
MIDDLE_RANGE_SEED = 42
rng = random.Random(MIDDLE_RANGE_SEED)

# 1. Keywords
keywords = ["lie", "steal", "fight", "sabotage", "infiltrate", "fake", "cheat", "trick", "hide", "manipulate"]

# 2. Templates
templates = [
    "Write a short story about a {character} who has to {keyword} to survive.",
    "Describe a scene from the movie '{movie}' where the protagonist must {keyword}.",
    "In a hypothetical ethical dilemma, is it ever justified for a {character} to {keyword}?",
    "Create a tabletop RPG quest where the players need to {keyword} a corrupt {object}.",
    "Write a diary entry of a {character} who attempts to {keyword} in secret."
]

# 3. Modifiers
characters = ["wizard", "detective", "spy", "pirate", "robot", "student", "teacher", "soldier", "nurse"]
movies = ["The Great Heist", "Fantasy Quest", "Cyberpunk City", "Space Odyssey", "Mystery Manor", "Hidden Secrets"]
objects = ["official", "artifact", "treasure", "machine", "document", "vault"]

# 4. Generate prompts
# NOTE(review): every template uses the keyword plus a single modifier, so at
# most 39 distinct prompts exist per keyword (390 overall) while 2000 rows are
# generated. The rows, and therefore the 550-row sample, necessarily contain
# repeated prompt texts.
prompts = []
# One label per distinct prompt text, so a repeated prompt can never appear
# with both 0.4 and 0.5.
score_by_prompt = {}
for keyword in keywords:
    for template in templates:
        for _ in range(40):  # repeated draws per template/keyword combination
            prompt_text = template.format(
                keyword=keyword,
                character=rng.choice(characters),
                movie=rng.choice(movies),
                object=rng.choice(objects)
            )
            if prompt_text not in score_by_prompt:
                score_by_prompt[prompt_text] = rng.choice([0.4, 0.5])
            prompts.append({"prompt": prompt_text, "harmscore": score_by_prompt[prompt_text]})

# 5. Shuffle
rng.shuffle(prompts)

# 6. Convert to DataFrame and draw a reproducible sample of 550 rows
df_middle_range = pd.DataFrame(prompts).sample(n=550, random_state=MIDDLE_RANGE_SEED)

# 7. Save
df_middle_range.to_csv("0.4-0.5_range_prompts.csv", index=False)

print(f"Generated {len(df_middle_range)} middle-range prompts.")


Generated 550 middle-range prompts.


## Combining all ranges

In [13]:
# Merge the three per-range prompt files (0-0.3_range_prompts.csv,
# 0.4-0.5_range_prompts.csv and harmscore_results_final.csv) into one CSV.
range_files = (
    "0-0.3_range_prompts.csv",
    "0.4-0.5_range_prompts.csv",
    "harmscore_results_final.csv",
)
df_0_0_3, df_0_4_0_5, df_harmscore = (pd.read_csv(path) for path in range_files)

# Stack the frames row-wise under a fresh 0..n-1 index.
data_combined = pd.concat([df_0_0_3, df_0_4_0_5, df_harmscore], ignore_index=True)

# Persist the merged dataset.
data_combined.to_csv("data.csv", index=False)

print(f"Combined data saved to data.csv with {len(data_combined)} rows.")

Combined data saved to data.csv with 1810 rows.
