In [None]:
import csv
import random

# ---------------------------------------------------------------------------
# 1. EXTENDED BIAS CATEGORIES / GROUPS
# ---------------------------------------------------------------------------
# Demographic groups substituted for the {subject} placeholder of the biased
# templates. Order is preserved from the original list; the group comments
# below are only a reading aid.
bias_subjects = [
    # Gender and age
    "Women",
    "Men",
    "Non-binary individuals",
    "Older men",
    "Older women",
    "Young men",
    "Young women",
    "Teenagers",
    "Senior citizens",
    # Race, ethnicity and nationality
    "Black people",
    "White people",
    "Asian people",
    "Hispanic people",
    "Latinx communities",
    "Indigenous communities",
    "Arab people",
    "Persian people",
    "Turkish people",
    "Indian people",
    "Jewish people",
    # Religion and belief
    "Muslims",
    "Christians",
    "Hindus",
    "Buddhists",
    "Sikhs",
    "Jains",
    "Atheists",
    "Agnostics",
    # Sexual orientation and gender identity
    "LGBTQ+ individuals",
    "Trans people",
    "Gay men",
    "Lesbians",
    "Bisexual people",
    "Queer people",
    # Disability and neurodiversity
    "People with disabilities",
    "People with physical disabilities",
    "People with mental health conditions",
    "Neurodivergent individuals",
    "Autistic individuals",
    "People with ADHD",
    "People with dyslexia",
    # Migration and geography
    "Refugees",
    "Immigrants",
    "Migrant workers",
    "Rural communities",
    "Urban communities",
    # Family role
    "Single parents",
    "Working mothers",
    "Stay-at-home fathers",
    # Economic background
    "People from low-income backgrounds",
    "People from wealthy backgrounds",
    # Personal history
    "Ex-convicts",
    "Recovering addicts",
]

# ---------------------------------------------------------------------------
# 2. EXTENDED CONTEXTS OR FIELDS
# ---------------------------------------------------------------------------
# Fields of activity substituted for the {context} placeholder in both the
# biased and the unbiased templates. Order is preserved from the original list.
contexts = [
    "technology",
    "finance",
    "sports",
    "education",
    "leadership",
    "politics",
    "management",
    "entrepreneurship",
    "STEM",
    "innovation",
    "creative arts",
    "healthcare",
    "law enforcement",
    "hospitality",
    "marketing",
    "entertainment",
    "military service",
    "construction",
    "customer service",
    "academia",
    "social work",
    "engineering",
    "consulting",
    "government",
    "journalism",
    "medicine",
    "philosophy",
    "architecture",
    "urban planning",
    "public policy",
    "theater",
    "comedy",
    "drama",
    "food service",
    "transportation",
    "logistics",
    "publishing",
    "public speaking",
    "agribusiness",
]

# ---------------------------------------------------------------------------
# 3. EXTENDED TEMPLATES FOR BIASED STATEMENTS (INPUT TEXT)
# ---------------------------------------------------------------------------
# Sentence patterns for the deliberately stereotyped statements that end up in
# the "input_text" CSV column. Every pattern contains both a {subject} and a
# {context} placeholder, which generate_biased_unbiased_pair() fills in with
# str.format(). The wording is intentionally prejudiced: these are the negative
# examples the dataset pairs with neutral rewrites.
biased_templates = [
    "{subject} should not pursue {context} because they are incapable.",
    "{subject} are too emotional for {context}.",
    "Only {subject} can succeed in {context}.",
    "{subject} are not suited for {context} due to their background.",
    "{subject} lack the discipline needed to excel in {context}.",
    "{subject} are not intelligent enough for {context}.",
    "{subject} do not have the creativity required for {context}.",
    "{subject} can't be trusted in {context}.",
    "{subject} have no place in {context}.",
    "{subject} will fail in {context} because of who they are.",
    "{subject} aren't naturally inclined toward {context}.",
    "{subject} are too traditional-minded for {context}.",
    "{subject} will ruin {context} if allowed to participate.",
    "{subject} are too aggressive for {context}.",
    "{subject} are a bad fit for {context} roles.",
    "{subject} should never be considered for {context} opportunities.",
    "Employers should exclude {subject} from {context}.",
    "{subject} do not deserve leadership roles in {context}.",
    "{subject} will compromise the integrity of {context}.",
    "{subject} are inherently unqualified for {context}."
]

# ---------------------------------------------------------------------------
# 4. EXTENDED TEMPLATES FOR UNBIASED RESPONSES (TARGET TEXT)
# ---------------------------------------------------------------------------
# Sentence patterns for the neutral counter-statements that end up in the
# "target_text" CSV column. They contain only a {context} placeholder (no
# {subject}), so the response never names the group targeted by the biased
# input; generate_biased_unbiased_pair() fills {context} with the same context
# it used for the biased sentence.
unbiased_templates = [
    "Success in {context} depends on dedication and skill, not personal background.",
    "Aptitude for {context} is driven by effort and learning, not stereotypes.",
    "Performance in {context} is determined by commitment, not identity.",
    "Anyone can excel in {context} with the proper training and mindset.",
    "Ability in {context} is a matter of hard work and passion, not group labels.",
    "Personal drive and perseverance lead to {context} success, regardless of background.",
    "Merit, determination, and opportunity shape outcomes in {context}, not biases.",
    "Talent for {context} can be found in individuals of all backgrounds.",
    "With appropriate support and practice, anyone can thrive in {context}.",
    "Skill in {context} arises from effort and experience, not demographic factors.",
    "All individuals should be judged by their capabilities in {context}, not prejudice.",
    "Proficiency in {context} is a result of training and dedication, not inherent traits.",
    "Expertise in {context} can develop in anyone who puts in the work.",
    "There is no single identity that guarantees or prevents success in {context}.",
    "Anyone willing to learn can master {context}, regardless of who they are.",
    "Achieving excellence in {context} is about determination, not demographics.",
    "A person's potential in {context} is shaped by dedication and support systems.",
    "Barriers in {context} can be overcome through inclusion and equal opportunity.",
    "Positive outcomes in {context} come from skill-building, not exclusion.",
    "It is unfair to judge competence in {context} based on stereotypes."
]

# ---------------------------------------------------------------------------
# 5. GENERATION & VALIDATION LOGIC
# ---------------------------------------------------------------------------
# Target number of unique (biased, unbiased) rows in the output CSV.
# Uniqueness is enforced on the biased sentence, and only
# 53 subjects x 39 contexts x 20 biased templates = 41,340 distinct biased
# sentences exist, so this value must not exceed that product.
NUM_ROWS = 40000
# Path of the CSV that gets written.
# NOTE(review): the file name says 50000 while NUM_ROWS is 40000 - confirm
# which one is intended before renaming; other code may read this path.
OUTPUT_FILE = "biased_unbiased_dataset_50000.csv"
# Stall threshold for the generation loop: the number of consecutive rejected
# candidates (failed sense_check or duplicate biased sentence) after which
# progress is checkpointed to disk.
CHECKPOINT_LIMIT = 10  # i.e. after 10 consecutive rejections (an earlier comment said 1000)

def generate_biased_unbiased_pair():
    """Build one random (biased sentence, unbiased sentence) pair.

    A subject, a context, a biased template and an unbiased template are
    drawn with ``random.choice`` (in that order). Both sentences share the
    same context; only the biased one mentions the subject.

    Returns:
        A ``(biased_sentence, unbiased_sentence)`` tuple of formatted strings.
    """
    # Dict displays evaluate left to right, so the draw order is
    # subject -> context, matching a seeded run of the original code.
    fields = {
        "subject": random.choice(bias_subjects),
        "context": random.choice(contexts),
    }
    biased_pattern = random.choice(biased_templates)
    biased_sentence = biased_pattern.format(**fields)
    unbiased_pattern = random.choice(unbiased_templates)
    return biased_sentence, unbiased_pattern.format(context=fields["context"])

def sense_check(biased_text, unbiased_text):
    """Return True when a generated pair looks usable.

    A pair is rejected when:
      * either text still contains a literal ``{subject}`` or ``{context}``
        placeholder (the template was not fully formatted),
      * both texts are identical once surrounding whitespace is stripped,
      * either stripped text is shorter than 10 characters.

    Args:
        biased_text: The formatted biased sentence.
        unbiased_text: The formatted unbiased sentence.

    Returns:
        bool: True if the pair passes every check, otherwise False.
    """
    leftover_markers = ("{subject}", "{context}")
    for candidate in (biased_text, unbiased_text):
        if any(marker in candidate for marker in leftover_markers):
            return False

    biased_core = biased_text.strip()
    unbiased_core = unbiased_text.strip()
    if biased_core == unbiased_core:
        return False

    # Both texts must reach the 10-character minimum.
    return min(len(biased_core), len(unbiased_core)) >= 10

def save_to_csv(records):
    """Write the header row plus all *records* to OUTPUT_FILE.

    The file is opened in write mode, so any existing content is replaced.

    Args:
        records: Sized iterable of ``(input_text, target_text)`` rows.
    """
    column_names = ["input_text", "target_text"]
    with open(OUTPUT_FILE, "w", encoding="utf-8", newline="") as handle:
        # Header first, then the data rows, in a single writerows call.
        csv.writer(handle).writerows([column_names, *records])
    print(f"Checkpoint saved: {len(records)} records written.")

def main():
    """Generate up to NUM_ROWS unique biased/unbiased pairs and save them as CSV.

    Every distinct biased sentence that the subject, context and template
    lists can produce is enumerated once, the pool is shuffled, and the first
    NUM_ROWS valid entries are kept. Each kept sentence is paired with a
    randomly chosen unbiased template filled with the same context.

    This replaces the earlier rejection-sampling loop, which had three
    problems:
      * it never terminated when NUM_ROWS exceeded the number of distinct
        valid biased sentences;
      * close to that limit almost every draw was a duplicate;
      * every CHECKPOINT_LIMIT consecutive rejections it rewrote the whole
        CSV, often without a single new row.

    The result is the same kind of uniformly random, duplicate-free selection,
    produced in one pass and written once. If fewer than NUM_ROWS valid pairs
    exist, all of them are written and a warning is printed.
    """
    from itertools import product

    # Map each distinct biased sentence to the context it was built from, so
    # the unbiased reply can reuse that context. Keying on the sentence keeps
    # the "unique biased text" guarantee even if two combinations collide.
    context_by_sentence = {}
    for template, subject, context in product(biased_templates, bias_subjects, contexts):
        sentence = template.format(subject=subject, context=context)
        context_by_sentence.setdefault(sentence, context)

    pool = list(context_by_sentence.items())
    random.shuffle(pool)

    records = []
    for biased, context in pool:
        if len(records) >= NUM_ROWS:
            break
        unbiased = random.choice(unbiased_templates).format(context=context)
        # Keep the validation step so edited templates are still screened.
        if sense_check(biased, unbiased):
            records.append((biased, unbiased))

    if len(records) < NUM_ROWS:
        print(
            f"Warning: only {len(records)} unique valid pairs can be built from the "
            f"current lists; {NUM_ROWS} were requested."
        )

    save_to_csv(records)
    print(f"CSV file '{OUTPUT_FILE}' created successfully with {len(records)} rows.")

# Run the generator only when this module is executed as the main program,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()



Streaming output truncated to the last 5000 lines.
No new records added for a while. Saving progress...
Checkpoint saved: 38419 records written.
No new records added for a while. Saving progress...
Checkpoint saved: 38420 records written.
No new records added for a while. Saving progress...
Checkpoint saved: 38420 records written.
No new records added for a while. Saving progress...
Checkpoint saved: 38420 records written.
No new records added for a while. Saving progress...
Checkpoint saved: 38422 records written.
No new records added for a while. Saving progress...
Checkpoint saved: 38427 records written.
No new records added for a while. Saving progress...
Checkpoint saved: 38428 records written.
No new records added for a while. Saving progress...
Checkpoint saved: 38429 records written.
No new records added for a while. Saving progress...
Checkpoint saved: 38430 records written.
No new records added for a while. Saving progress...
Checkpoint saved: 38430 records writ