### Bias Detection and Handling

In [81]:
import pandas as pd
df = pd.read_pickle('Data/df_cleaned.pkl')

In [82]:
df.head()

Unnamed: 0,Had_alcohol_in_the_past,General_hearing_condition,Had_high_blood_pressure,WBC,Haemoglobin,Platelete,Gender,Age,Race_Ethnicity,Country_of_Birth,...,Number_of_hours_of_sleep,Cholestrol_level,SystolicBP,DiastolicBP,Pulse,BODY_MEASURE_COMPOSITE,blood_macros,mean_steroid_ng_dl,functional_difficulty_composite,Age_Group
0,Yes,Excellent,Yes,4.7,15.7,259.0,Male,43.0,Other Race - Including Multi-Racial,Unknown/NA,...,9.5,264.0,132.666667,96.0,81.0,64.34,50.31675,155.841667,No difficulty,31-45
1,Yes,Moderate hearing trouble,Yes,6.3,15.2,221.0,Male,66.0,Non-Hispanic White,Born in 50 US states or Washington,...,9.0,214.0,117.0,78.666667,72.0,67.6,52.49825,104.99,Some difficulty,61-75
2,Yes,Moderate hearing trouble,No,5.7,13.8,235.0,Female,44.0,Other Hispanic,Unknown/NA,...,8.0,187.0,109.0,78.333333,81.333333,60.36,43.3125,135.308333,A lot of difficulty,31-45
3,Yes,Good,No,6.8,13.9,264.5,Male,43.0,Mexican American,Unknown/NA,...,7.5,186.0,113.666667,74.333333,72.0,62.64,47.543875,103.591667,Some difficulty,31-45
4,Yes,Good,No,6.5,14.0,241.0,Female,65.0,Non-Hispanic White,Born in 50 US states or Washington,...,8.0,188.0,125.666667,74.0,69.333333,63.1,47.159,40.313333,No difficulty,61-75


In [83]:
for col in df.columns:
    print(f"Column: {col}")
    print(df[col].unique())
    print("-" * 50)

Column: Had_alcohol_in_the_past
['Yes', 'No', 'Unknown/NA']
Categories (3, object): ['Unknown/NA', 'Yes', 'No']
--------------------------------------------------
Column: General_hearing_condition
['Excellent', 'Moderate hearing trouble', 'Good', 'A little trouble', 'A lot of trouble', 'Deaf', 'Don't know']
Categories (7, object): ['Excellent', 'Good', 'A little trouble', 'Moderate hearing trouble', 'A lot of trouble', 'Deaf', 'Don't know']
--------------------------------------------------
Column: Had_high_blood_pressure
['Yes', 'No', 'Unknown/NA']
Categories (3, object): ['Unknown/NA', 'Yes', 'No']
--------------------------------------------------
Column: WBC
[ 4.7   6.3   5.7   6.8   6.5   5.5   6.    8.3   5.8   9.7   9.5   5.6
  5.4   6.6  10.    6.2   4.5   8.5   6.7   5.3   6.4   9.3   4.    6.45
  8.    8.8   8.1  10.4   8.2  14.3   7.8   5.9   7.    8.7   7.7   5.
 12.1   5.2   7.1   6.1   7.6  10.3  13.3  11.9  11.8   4.9   7.4   4.8
  7.5  10.6  13.9  13.4   3.9  10.1   4.6

Check 1: Demographic Representation (Visualization)
Purpose: Ensure no group is severely underrepresented

In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Assuming your dataframe is named 'df'
# df = pd.read_csv('your_nhanes_data.csv')

def analyze_representation_bias(df):
    """
    Analyze demographic representation to identify potential sampling bias
    """
    print("=" * 80)
    print("1. REPRESENTATION BIAS ANALYSIS")
    print("=" * 80)
    
    demographic_cols = ['Gender', 'Race_Ethnicity', 'Age_Group', 'Education_Level', 
                        'Marital_Status', 'Country_of_Birth']
    
    for col in demographic_cols:
        if col in df.columns:
            print(f"\n{col} Distribution:")
            print("-" * 40)
            counts = df[col].value_counts(dropna=False)
            percentages = df[col].value_counts(normalize=True, dropna=False) * 100
            
            result_df = pd.DataFrame({
                'Count': counts,
                'Percentage': percentages.round(2)
            })
            print(result_df)
            
            # Check for severe underrepresentation (< 5%)
            underrepresented = percentages[percentages < 5]
            if len(underrepresented) > 0:
                print(f"\n⚠️  WARNING: Underrepresented groups (< 5%):")
                for group, pct in underrepresented.items():
                    print(f"   - {group}: {pct:.2f}%")


def analyze_outcome_disparities(df):
    """
    Analyze health outcome disparities across demographic groups
    """
    print("\n" + "=" * 80)
    print("3. HEALTH OUTCOME DISPARITY ANALYSIS")
    print("=" * 80)
    
    # Conditions to analyze
    conditions = ['Has_diabetes', 'Has_high_blood_pressure', 'Had_heart_attack', 
                  'Had_Cancer', 'Had_Asthma', 'General_health_condition']
    
    demographic_cols = ['Gender', 'Race_Ethnicity', 'Age_Group']
    
    for condition in conditions:
        if condition not in df.columns:
            continue
            
        print(f"\n{condition}:")
        print("-" * 40)
        
        for demo_col in demographic_cols:
            if demo_col not in df.columns:
                continue
                
            if condition == 'General_health_condition':
                # For ordinal health condition, calculate mean rating
                health_order = ['Excellent', 'Very good', 'Good', 'Fair', 'Poor']
                temp_df = df[df[condition].isin(health_order)].copy()
                temp_df['health_score'] = temp_df[condition].map(
                    {h: i for i, h in enumerate(health_order)}
                )
                by_group = temp_df.groupby(demo_col)['health_score'].mean()
                print(f"\n  {demo_col} (lower score = better health):")
                print(f"  {by_group.round(2).to_dict()}")
            else:
                # For binary conditions
                by_group = df.groupby(demo_col)[condition].apply(
                    lambda x: (x == 'Yes').sum() / x.notna().sum() * 100
                )
                print(f"\n  {demo_col} (% with condition):")
                print(f"  {by_group.round(2).to_dict()}")


def analyze_continuous_health_metrics(df):
    """
    Analyze continuous health metrics for disparities
    """
    print("\n" + "=" * 80)
    print("4. CONTINUOUS HEALTH METRICS ANALYSIS")
    print("=" * 80)
    
    metrics = ['SystolicBP', 'DiastolicBP', 'Cholestrol_level', 
               'HDL_mg', 'WBC', 'Haemoglobin']
    demographic_cols = ['Gender', 'Race_Ethnicity']
    
    for metric in metrics:
        if metric not in df.columns:
            continue
            
        print(f"\n{metric}:")
        print("-" * 40)
        
        for demo_col in demographic_cols:
            if demo_col not in df.columns:
                continue
                
            print(f"\n  By {demo_col}:")
            by_group = df.groupby(demo_col)[metric].agg(['mean', 'std', 'count'])
            print(f"  {by_group.round(2)}")
            
            # Perform statistical test (ANOVA)
            groups = [group[metric].dropna() for name, group in df.groupby(demo_col)]
            if len(groups) > 1 and all(len(g) > 0 for g in groups):
                f_stat, p_value = stats.f_oneway(*groups)
                if p_value < 0.05:
                    print(f"  ⚠️  Significant difference detected (p={p_value:.4f})")


def analyze_socioeconomic_bias(df):
    """
    Analyze potential socioeconomic biases
    """
    print("\n" + "=" * 80)
    print("5. SOCIOECONOMIC BIAS ANALYSIS")
    print("=" * 80)
    
    if 'Income_to_Poverty_Ratio' in df.columns:
        # Create income categories
        df_temp = df.copy()
        df_temp['Income_Category'] = pd.cut(
            df_temp['Income_to_Poverty_Ratio'],
            bins=[0, 1.3, 3.5, 10],
            labels=['Below Poverty', 'Low Income', 'Above Low Income']
        )
        
        print("\nIncome Category Distribution:")
        print(df_temp['Income_Category'].value_counts(normalize=True) * 100)
        
        # Health insurance coverage by income
        if 'Covered_by_health_insurance' in df.columns:
            print("\nHealth Insurance Coverage by Income:")
            coverage = pd.crosstab(
                df_temp['Income_Category'],
                df_temp['Covered_by_health_insurance'],
                normalize='index'
            ) * 100
            print(coverage.round(2))
        
        # Health outcomes by income
        if 'General_health_condition' in df.columns:
            print("\nGeneral Health by Income:")
            health = pd.crosstab(
                df_temp['Income_Category'],
                df_temp['General_health_condition'],
                normalize='index'
            ) * 100
            print(health.round(2))
    


# Run the analysis
# generate_bias_report(df)

In [85]:
analyze_representation_bias(df)
analyze_outcome_disparities(df)
analyze_continuous_health_metrics(df)
analyze_socioeconomic_bias(df)

1. REPRESENTATION BIAS ANALYSIS

Gender Distribution:
----------------------------------------
        Count  Percentage
Gender                   
Female   5051       54.59
Male     4202       45.41

Race_Ethnicity Distribution:
----------------------------------------
                                     Count  Percentage
Race_Ethnicity                                        
Non-Hispanic White                    5133       55.47
Non-Hispanic Black                    1199       12.96
Other Race - Including Multi-Racial   1153       12.46
Other Hispanic                        1002       10.83
Mexican American                       766        8.28

Age_Group Distribution:
----------------------------------------
           Count  Percentage
Age_Group                   
61-75       2513       27.16
31-45       1755       18.97
46-60       1637       17.69
0-18        1187       12.83
19-30       1172       12.67
76+          989       10.69

Education_Level Distribution:
----------------

In [86]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import LabelEncoder

# ============================================================================
# SIMPLE BIAS CORRECTION (KEEPS ALL OTHER COLUMNS INTACT)
# ============================================================================

# Make a copy to avoid modifying original
df_balanced = df.copy()

# Step 1: Oversample underrepresented groups
print("Step 1: Oversampling underrepresented groups...")

target_size = int(len(df_balanced) * 0.08)  # 8% of dataset

# Oversample by Race
race_dfs = []
for race in df_balanced['Race_Ethnicity'].unique():
    race_df = df_balanced[df_balanced['Race_Ethnicity'] == race].copy()
    if len(race_df) < target_size:
        race_df = race_df.sample(n=target_size, replace=True, random_state=42)
    race_dfs.append(race_df)

df_balanced = pd.concat(race_dfs, ignore_index=True)

# Oversample by Education
edu_dfs = []
for edu in df_balanced['Education_Level'].unique():
    edu_df = df_balanced[df_balanced['Education_Level'] == edu].copy()
    if edu == 'Unknown/NA':
        edu_dfs.append(edu_df)
        continue
    if len(edu_df) < target_size:
        edu_df = edu_df.sample(n=target_size, replace=True, random_state=42)
    edu_dfs.append(edu_df)

df_balanced = pd.concat(edu_dfs, ignore_index=True).reset_index(drop=True)

print(f"Dataset size: {len(df)} -> {len(df_balanced)}")

# Step 2: Encode demographics for regression
print("\nStep 2: Encoding demographics...")

le_race = LabelEncoder()
le_gender = LabelEncoder()
le_age = LabelEncoder()

df_balanced['Race_encoded'] = le_race.fit_transform(df_balanced['Race_Ethnicity'].astype(str))
df_balanced['Gender_encoded'] = le_gender.fit_transform(df_balanced['Gender'].astype(str))
df_balanced['Age_encoded'] = le_age.fit_transform(df_balanced['Age_Group'].astype(str))

X_demographics = df_balanced[['Race_encoded', 'Gender_encoded', 'Age_encoded']]

# Step 3: Correct continuous variables using regression
print("\nStep 3: Correcting continuous variables...")

continuous_vars = ['SystolicBP', 'DiastolicBP', 'Cholestrol_level', 
                   'HDL_mg', 'WBC', 'Haemoglobin']

for var in continuous_vars:
    if var not in df_balanced.columns:
        print(f"  ⚠️  {var} not found in dataset, skipping...")
        continue
    
    # Get non-missing data
    mask = df_balanced[var].notna()
    if mask.sum() < 10:
        print(f"  ⚠️  {var} has too few valid values, skipping...")
        continue
    
    X = X_demographics[mask]
    y = df_balanced.loc[mask, var].astype(float)
    
    # Fit regression
    model = LinearRegression()
    model.fit(X, y)
    
    # Get residuals
    predictions = model.predict(X)
    residuals = y - predictions
    
    # Overwrite ONLY this column with residuals
    df_balanced.loc[mask, var] = residuals
    
    print(f"  ✓ {var} corrected in place (R² = {model.score(X, y):.3f})")

# Step 4: Convert and correct binary variables
print("\nStep 4: Converting and correcting binary outcomes...")

binary_vars = ['Has_diabetes', 'Had_heart_attack', 'Had_Cancer', 'Had_Asthma']

for var in binary_vars:
    if var not in df_balanced.columns:
        print(f"  ⚠️  {var} not found in dataset, skipping...")
        continue
    
    print(f"\n  Processing {var}...")
    print(f"    Original unique values: {df_balanced[var].unique()}")
    
    # FORCE convert to numeric - handle ALL possible values
    value_map = {}
    for val in df_balanced[var].unique():
        if pd.isna(val):
            continue
        val_str = str(val).strip().lower()
        if val_str == 'yes':
            value_map[val] = 1.0
        else:  # No, Borderline, Refused, Don't know, etc. all become 0
            value_map[val] = 0.0
    
    # Apply mapping
    df_balanced[var] = df_balanced[var].map(value_map)
    print(f"    Converted to numeric: {df_balanced[var].unique()}")
    
    # Get non-missing data
    mask = df_balanced[var].notna()
    if mask.sum() < 10:
        print(f"    ⚠️  Too few valid values, skipping...")
        continue
    
    X = X_demographics[mask]
    y = df_balanced.loc[mask, var]
    
    # Skip if only one class
    if y.nunique() < 2:
        print(f"    ⚠️  Only one class present, skipping...")
        continue
    
    print(f"    Class distribution: 0={(y==0).sum()}, 1={(y==1).sum()}")
    
    # Fit logistic regression
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X, y.astype(int))
    
    # Get residuals
    pred_probs = model.predict_proba(X)[:, 1]
    residuals = y - pred_probs
    
    # Overwrite ONLY this column with residuals
    df_balanced.loc[mask, var] = residuals
    
    print(f"    ✓ {var} corrected in place")

# Step 5: Handle General_health_condition encoding if needed
print("\nStep 5: Encoding General_health_condition...")

if 'General_health_condition' in df_balanced.columns:
    if df_balanced['General_health_condition'].dtype == 'object':
        print(f"  General_health_condition unique values: {df_balanced['General_health_condition'].unique()}")
        health_map = {'Excellent': 0, 'Very good': 1, 'Good': 2, 
                      'Fair': 3, 'Poor': 4, "Don't know": 2}
        df_balanced['General_health_condition'] = df_balanced['General_health_condition'].map(health_map)

print("\n" + "="*80)
print("BIAS CORRECTION COMPLETE")
print("="*80)
print(f"Total columns in df_balanced: {len(df_balanced.columns)}")
print(f"Modified columns: {continuous_vars + binary_vars + ['General_health_condition']}")
print(f"All other columns remain unchanged")
print("="*80)

# This is your final dataframe with all columns
df_final = df_balanced

Step 1: Oversampling underrepresented groups...
Dataset size: 9253 -> 9694

Step 2: Encoding demographics...

Step 3: Correcting continuous variables...
  ✓ SystolicBP corrected in place (R² = 0.296)
  ✓ DiastolicBP corrected in place (R² = 0.070)
  ✓ Cholestrol_level corrected in place (R² = 0.049)
  ✓ HDL_mg corrected in place (R² = 0.064)
  ✓ WBC corrected in place (R² = 0.006)
  ✓ Haemoglobin corrected in place (R² = 0.184)

Step 4: Converting and correcting binary outcomes...

  Processing Has_diabetes...
    Original unique values: ['No', 'Yes', 'Borderline', 'Don't know']
Categories (4, object): ['Borderline', 'Don't know', 'No', 'Yes']
    Converted to numeric: [0. 1.]
    Class distribution: 0=8525, 1=1169
    ✓ Has_diabetes corrected in place

  Processing Had_heart_attack...
    Original unique values: ['No', 'Yes', 'Unknown/NA']
Categories (3, object): ['Unknown/NA', 'Yes', 'No']
    Converted to numeric: [0. 1.]
    Class distribution: 0=9349, 1=345
    ✓ Had_heart_attack 

In [87]:
df = df_final.copy()

In [88]:
from scipy import stats

print("="*80)
print("BIAS VERIFICATION AFTER CORRECTION")
print("="*80)

# Check continuous variables - means should be near 0 for all groups
continuous_vars = ['SystolicBP', 'DiastolicBP', 'Cholestrol_level', 
                   'HDL_mg', 'WBC', 'Haemoglobin']

print("\nCONTINUOUS VARIABLES - Mean by Race (should all be ~0):")
print("-"*80)
for var in continuous_vars:
    print(f"\n{var}:")
    race_means = df_final.groupby('Race_Ethnicity')[var].mean()
    print(race_means)
    
    # ANOVA test
    groups = [df_final[df_final['Race_Ethnicity']==race][var].dropna() 
              for race in df_final['Race_Ethnicity'].unique()]
    f_stat, p_val = stats.f_oneway(*groups)
    
    if p_val > 0.05:
        print(f"✓ No significant bias (p={p_val:.4f})")
    else:
        print(f"⚠️ Some bias remains (p={p_val:.4f})")

# Check binary variables - means should be near 0 for all groups
binary_vars = ['Has_diabetes', 'Had_heart_attack', 'Had_Cancer', 'Had_Asthma']

print("\n" + "="*80)
print("BINARY VARIABLES (RESIDUALS) - Mean by Race (should all be ~0):")
print("-"*80)
for var in binary_vars:
    print(f"\n{var}:")
    race_means = df_final.groupby('Race_Ethnicity')[var].mean()
    print(race_means)
    
    # ANOVA test
    groups = [df_final[df_final['Race_Ethnicity']==race][var].dropna() 
              for race in df_final['Race_Ethnicity'].unique()]
    f_stat, p_val = stats.f_oneway(*groups)
    
    if p_val > 0.05:
        print(f"✓ No significant bias (p={p_val:.4f})")
    else:
        print(f"⚠️ Some bias remains (p={p_val:.4f})")

BIAS VERIFICATION AFTER CORRECTION

CONTINUOUS VARIABLES - Mean by Race (should all be ~0):
--------------------------------------------------------------------------------

SystolicBP:
Race_Ethnicity
Mexican American                      -0.278362
Other Hispanic                        -0.222600
Non-Hispanic White                    -0.863192
Non-Hispanic Black                     2.758136
Other Race - Including Multi-Racial    1.335065
Name: SystolicBP, dtype: float64
⚠️ Some bias remains (p=0.0000)

DiastolicBP:
Race_Ethnicity
Mexican American                      -0.998957
Other Hispanic                        -0.649667
Non-Hispanic White                    -0.276485
Non-Hispanic Black                     1.930308
Other Race - Including Multi-Racial    0.571827
Name: DiastolicBP, dtype: float64
⚠️ Some bias remains (p=0.0000)

Cholestrol_level:
Race_Ethnicity
Mexican American                       1.547090
Other Hispanic                        -3.279460
Non-Hispanic White           