In [3]:
from datasets import load_dataset
import pickle
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


1. Load the Bias in Bios dataset from the Hugging Face Hub: https://huggingface.co/datasets/LabHC/bias_in_bios

In [4]:
# Load every split (train / test / dev) of Bias in Bios from the Hugging Face Hub.
dataset = load_dataset("LabHC/bias_in_bios")



The dataset ships pre-split into train, test and dev, so we concatenate the three splits into a single DataFrame.

In [5]:
# Convert each predefined split to a DataFrame. Dataset.to_pandas() is the
# library's own conversion, so the frame is not assembled row by row.
df_train = dataset['train'].to_pandas()
df_test = dataset['test'].to_pandas()
df_valid = dataset['dev'].to_pandas()

# Stack the splits in train -> test -> dev order. ignore_index=True gives the
# combined frame a fresh 0..N-1 index instead of repeating each split's own.
df = pd.concat([df_train, df_test, df_valid], ignore_index=True)

# Every row of every split must survive the concatenation.
assert len(df) == len(df_train) + len(df_test) + len(df_valid)

df.head()  # rich preview of the first few rows


                                           hard_text  profession  gender
0  He is also the project lead of and major contr...          21       0
1  She is able to assess, diagnose and treat mino...          13       1
2  Prior to law school, Brittni graduated magna c...           2       1
3  He regularly contributes to India’s First Onli...          11       0
4  He completed his medical degree at Northwester...          21       0


In [7]:
df.shape        # (rows, columns) of the combined train + test + dev frame

(396189, 3)

In [8]:
# Print the split names together with their column names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['hard_text', 'profession', 'gender'],
        num_rows: 257478
    })
    test: Dataset({
        features: ['hard_text', 'profession', 'gender'],
        num_rows: 99069
    })
    dev: Dataset({
        features: ['hard_text', 'profession', 'gender'],
        num_rows: 39642
    })
})


In [9]:
# Draw 5 random rows for a quick look; the fixed random_state keeps the draw repeatable.
df_sample = df.sample(n=5, random_state=42)

In [10]:
# Show the sampled rows ('profession' and 'gender' are still integer codes here).
df_sample

Unnamed: 0,hard_text,profession,gender
204648,"He practices in Abilene, Texas and has the pro...",6,0
214358,"Prior to joining Skin Solutions Dermatology, H...",19,1
146258,"She is the author of two poetry collections, ""...",21,1
309728,As the faculty advisor for the Vanderbilt stud...,21,1
247901,She graduated with honors in 2010. Having more...,13,1


Map the integer codes in the 'profession' column to occupation names, as described in the Bias in Bios dataset documentation.

In [11]:
# Lookup table {integer code -> occupation name} for the 'profession' column.
# The names are listed in code order, so each name's position in the tuple is
# its code (0 = accountant ... 27 = yoga_teacher).
occupation_mapping = dict(enumerate((
    "accountant",
    "architect",
    "attorney",
    "chiropractor",
    "comedian",
    "composer",
    "dentist",
    "dietitian",
    "dj",
    "filmmaker",
    "interior_designer",
    "journalist",
    "model",
    "nurse",
    "painter",
    "paralegal",
    "pastor",
    "personal_trainer",
    "photographer",
    "physician",
    "poet",
    "professor",
    "psychologist",
    "rapper",
    "software_engineer",
    "surgeon",
    "teacher",
    "yoga_teacher",
)))

Map the integer codes in the 'gender' column to labels: 0 is 'Male' and 1 is 'Female'.

In [12]:
# Lookup table {integer code -> gender label} for the 'gender' column.
gender_mapping = dict(zip((0, 1), ("Male", "Female")))

In [13]:
# Replace the integer codes in 'profession' and 'gender' with readable labels.
#
# Series.map() turns every value that is not a key of the dict into NaN, so
# running this cell a second time (when the columns already hold strings) would
# wipe both columns. Only map a column while it still contains integer codes,
# which makes the cell safe to re-run.
for column, mapping in (("profession", occupation_mapping), ("gender", gender_mapping)):
    if pd.api.types.is_integer_dtype(df[column]):
        df[column] = df[column].map(mapping)
    # A NaN here means a code with no entry in the mapping (or an earlier bad run).
    assert df[column].notna().all(), f"unmapped values in '{column}'"


In [14]:
# Check that the integer codes were replaced by readable labels.
df.head()

Unnamed: 0,hard_text,profession,gender
0,He is also the project lead of and major contr...,professor,Male
1,"She is able to assess, diagnose and treat mino...",nurse,Female
2,"Prior to law school, Brittni graduated magna c...",attorney,Female
3,He regularly contributes to India’s First Onli...,journalist,Male
4,He completed his medical degree at Northwester...,professor,Male


In [15]:
# Number of samples per occupation
num_samples = 10

# Write one small CSV of randomly sampled bios per occupation.
# Group once instead of re-scanning the whole frame for every occupation.
# sort=False keeps the groups in order of first appearance (the same order
# df['profession'].unique() yields), and groupby leaves out rows whose
# profession is missing, which would otherwise break the filename below.
for profession, df_filtered in df.groupby('profession', sort=False):
    # Sample N records (or the whole group if it is smaller); fixed seed for repeatability
    sampled_df = df_filtered.sample(n=min(num_samples, len(df_filtered)), random_state=42)

    # Save as CSV in the current working directory, e.g. "nurse_samples.csv"
    filename = f"{profession.replace(' ', '_').lower()}_samples.csv"
    sampled_df.to_csv(filename, index=False)

    print(f"Saved {len(sampled_df)} samples for {profession} in {filename}")


Saved 10 samples for professor in professor_samples.csv
Saved 10 samples for nurse in nurse_samples.csv
Saved 10 samples for attorney in attorney_samples.csv
Saved 10 samples for journalist in journalist_samples.csv
Saved 10 samples for poet in poet_samples.csv
Saved 10 samples for surgeon in surgeon_samples.csv
Saved 10 samples for teacher in teacher_samples.csv
Saved 10 samples for psychologist in psychologist_samples.csv
Saved 10 samples for physician in physician_samples.csv
Saved 10 samples for pastor in pastor_samples.csv
Saved 10 samples for painter in painter_samples.csv
Saved 10 samples for photographer in photographer_samples.csv
Saved 10 samples for software_engineer in software_engineer_samples.csv
Saved 10 samples for composer in composer_samples.csv
Saved 10 samples for dentist in dentist_samples.csv
Saved 10 samples for yoga_teacher in yoga_teacher_samples.csv
Saved 10 samples for personal_trainer in personal_trainer_samples.csv
Saved 10 samples for dietitian in dietitia

In [17]:
# Read one of the saved CSV files back to spot-check what was written.
df1 = pd.read_csv('architect_samples.csv')
df1.head()

Unnamed: 0,hard_text,profession,gender
0,"He is a Salesforce MVP, with 5 Salesforce cert...",architect,Male
1,Her preferred alternate transportation is the ...,architect,Female
2,She leads the design and development of machin...,architect,Female
3,She strives to use her skills to benefit her c...,architect,Female
4,He received his Bachelor of Arts in Architectu...,architect,Male
