In [8]:
from datasets import load_dataset
import re
import json

In [2]:
# Load the "test" split of the GBaker/MedQA-USMLE-4-options-hf dataset.
# Later cells read each entry's 'sent1' (question stem), 'ending0'..'ending3'
# (answer options), 'label' and 'id' fields.
dataset = load_dataset("GBaker/MedQA-USMLE-4-options-hf", split="test")

## Patient Gender

In [9]:
# Gender-bearing nouns that can describe the patient, mapped to the gender
# they denote. extract_gender() builds its keyword regex from these keys.
gender_keywords = {
    "man": "male",
    "woman": "female",
    "male": "male",
    "female": "female",
    "boy": "male",
    "girl": "female",
}

# For each source gender: the pronouns to look for (lowercase keys) and the
# opposite-gender word that replaces each one. replace_pronouns() matches the
# keys as whole words, so the reflexives need their own entries ("him" does
# not match inside "himself").
# NOTE: English "her" is both an object pronoun ("saw her" -> "saw him") and a
# possessive ("her pain" -> "his pain"); this table always maps it to "his".
pronoun_map = {
    "male": {"he": "she", "his": "her", "him": "her", "himself": "herself"},
    "female": {"she": "he", "her": "his", "hers": "his", "herself": "himself"},
}

# Function to extract the subject keyword and determine gender
def extract_gender(question):
    """Infer the patient's gender from a question stem.

    An explicit keyword from ``gender_keywords`` takes priority: the first one
    that appears as a whole word (case-insensitively) decides the result.
    When no keyword is present, whole-word masculine pronouns (he/his/him) are
    tallied against feminine ones (she/her/hers) and the majority wins.

    Returns the mapped gender ("male" or "female"), or None when there is no
    keyword and the two pronoun tallies are equal (including both zero).
    """
    lowered = question.lower()

    # Explicit nouns ("man", "girl", ...) are the strongest signal.
    keyword_regex = r'\b(?:' + '|'.join(gender_keywords.keys()) + r')\b'
    keyword_hit = re.search(keyword_regex, lowered)
    if keyword_hit is not None:
        return gender_keywords[keyword_hit.group().lower()]

    # Fall back to a majority vote over gendered pronouns.
    masculine = sum(1 for _ in re.finditer(r'\b(?:he|his|him)\b', lowered))
    feminine = sum(1 for _ in re.finditer(r'\b(?:she|her|hers)\b', lowered))

    if masculine == feminine:
        return None  # no pronouns at all, or an exact tie
    return "male" if masculine > feminine else "female"

# Function to replace pronouns based on gender
def replace_pronouns(question, original_gender, target_gender):
    """Swap the gendered pronouns in ``question`` for the other gender's.

    Args:
        question: Text to rewrite.
        original_gender: Key into ``pronoun_map`` selecting which pronouns to
            look for and what each one is replaced with.
        target_gender: Gender the text should end up with. When it equals
            ``original_gender`` there is nothing to swap and the text is
            returned unchanged.

    Returns:
        The rewritten text. Pronouns are matched as whole words,
        case-insensitively, and each replacement keeps the capitalisation of
        the word it replaces ("He" -> "She", "HIS" -> "HER", "him" -> "her").

    Raises:
        KeyError: if ``original_gender`` is not a key of ``pronoun_map``.
    """
    pronouns = pronoun_map[original_gender]
    if original_gender == target_gender or not pronouns:
        return question

    lookup = {source.lower(): target for source, target in pronouns.items()}
    # One alternation, longest first, applied in a single pass so that text
    # produced by one replacement can never be rewritten by another.
    alternation = '|'.join(
        re.escape(source) for source in sorted(lookup, key=len, reverse=True)
    )
    pattern = re.compile(r'\b(?:' + alternation + r')\b', flags=re.IGNORECASE)

    def _swap(match):
        """Return the replacement pronoun, mirroring the matched word's case."""
        found = match.group()
        replacement = lookup[found.lower()]
        if len(found) > 1 and found.isupper():
            return replacement.upper()
        if found[0].isupper():
            return replacement.capitalize()
        return replacement

    return pattern.sub(_swap, question)

def match_patient_info_pattern(question):
    """
    Detect a question that opens with a patient description such as
    "A 22-year-old female" or "An 81 yr old overweight woman".

    After stripping surrounding whitespace, the text must begin with:
      - "A"/"a" optionally followed by "n",
      - whitespace and an integer age,
      - a hyphen or whitespace, then "year" or "yr",
      - an optional hyphen or whitespace, then "old",
      - whitespace and one word, or two words when a second one follows
        (e.g. an adjective plus a noun).

    Returns the matched leading text, or None when the question does not
    start this way.
    """
    found = re.search(
        r'^[Aa]n?\s+(\d+)[-\s](?:year|yr)[-\s]?old\s+(?:\w+\s+)?\w+',
        question.strip(),
    )
    return found.group() if found else None


In [18]:
# Collect every question whose stem opens with a patient description
# ("A 45-year-old man ..."); all other questions are left out.
gender_biased_data = []

for entry in dataset:
    # Pull out the fields carried over into the new record.
    question = entry['sent1']
    options = [entry[f'ending{i}'] for i in range(4)]
    label, original_id = entry['label'], entry['id']

    # Keep the entry only when the leading patient phrase is recognised.
    subject = match_patient_info_pattern(question)
    if subject is not None:
        record = {
            "ID": original_id,
            "Original Question": question,
            "Original Options": options,
            "Subject": subject,
            "Label": label,
        }
        gender_biased_data.append(record)

In [19]:
# Number of questions kept, i.e. those whose stem matched the patient-description pattern.
len(gender_biased_data)

1046

In [21]:
# Write the collected records to disk as indented JSON, then confirm on stdout.
with open("gender_biased_data.json", "w") as json_file:
    json.dump(obj=gender_biased_data, fp=json_file, indent=4)

print("Demographic biased dataset saved as 'gender_biased_data.json'.")

Demographic biased dataset saved as 'gender_biased_data.json'.


## Demographic

In [3]:

# Search terms for each demographic group; contains_keywords() matches them
# case-insensitively as whole words/phrases. Spelling variants are listed
# explicitly because "african american" does not match the hyphenated
# "African-American" and "latino" does not match "Latina".
# NOTE(review): "white" and "black" also occur in non-demographic phrases
# (e.g. "white blood cell count", "black stools"), so counts based on these
# terms may overstate the number of demographic mentions — inspect the matches.
white_keywords = ["white", "caucasian"]
black_keywords = ["black", "african american", "african-american"]
asian_keywords = ["asian"]
hispanic_keywords = ["hispanic", "latino", "latina"]
demographic_keywords = white_keywords + black_keywords + asian_keywords + hispanic_keywords

# Terms that tie a question to a specific sex or family role.
# NOTE(review): not referenced by the cells shown here — presumably used to
# filter questions further down; confirm.
gender_specific_keywords = ["mother", "pregnant", "father"]

In [4]:
def contains_keywords(text, keywords):
    """Return True if any keyword occurs in ``text`` as a whole word or phrase.

    Matching is case-insensitive and anchored on word boundaries, so "asian"
    matches "Asian man" but not "Caucasian". Keywords are treated as literal
    text: regex metacharacters in them are escaped rather than interpreted.

    Args:
        text: String to search.
        keywords: Iterable of literal words or phrases.

    Returns:
        bool: True as soon as one keyword is found; False if none match or
        ``keywords`` is empty.
    """
    return any(
        re.search(rf"\b{re.escape(keyword)}\b", text, re.IGNORECASE)
        for keyword in keywords
    )

In [5]:
# Tally how many question stems mention each demographic group. A question is
# counted at most once: groups are checked in the order below and the first
# one whose keywords appear wins.
demographic_groups = (
    ("black", black_keywords),
    ("white", white_keywords),
    ("asian", asian_keywords),
    ("hispanic", hispanic_keywords),
)
group_counts = {group: 0 for group, _ in demographic_groups}

for entry in dataset:
    question = entry["sent1"]

    # Lazily test each group so later groups are skipped after the first hit.
    matched_group = next(
        (
            group
            for group, keywords in demographic_groups
            if contains_keywords(question, keywords)
        ),
        None,
    )
    if matched_group is not None:
        group_counts[matched_group] += 1

black_count = group_counts["black"]
white_count = group_counts["white"]
asian_count = group_counts["asian"]
hispanic_count = group_counts["hispanic"]

print(f"Total: {len(dataset)}")
print(f"Black: {black_count}")
print(f"White: {white_count}")
print(f"Asian: {asian_count}")
print(f"Hispanic: {hispanic_count}")

Total: 1273
Black: 18
White: 42
Asian: 2
Hispanic: 1
