### Bias Detection and Handling

In [1]:
# import pandas as pd
# df = pd.read_pickle('Data/df_cleaned.pkl')

import sys
import numpy as np
import types
import pandas as pd

df = pd.read_csv('Data/df_cleaned_v2.csv', index_col=0)


In [2]:
df.shape

(9183, 44)

In [3]:
df['blood_macros'].unique()

array([50.31675, 52.49825, 43.3125 , ..., 42.6405 , 49.97425, 44.17325])

In [4]:
df.head()

Unnamed: 0_level_0,General_hearing_condition,Had_high_blood_pressure,WBC,Haemoglobin,Platelete,Gender,Age,Race_Ethnicity,Country_of_Birth,Education_Level,...,Cholestrol_level,SystolicBP,DiastolicBP,Pulse,BODY_MEASURE_COMPOSITE,blood_macros,mean_steroid_ng_dl,functional_difficulty_composite,Age_Group,zero_count
Had_alcohol_in_the_past,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Yes,Excellent,Yes,4.7,15.7,259.0,Male,43.0,Other Race - Including Multi-Racial,Unknown/NA,College graduate or above,...,264.0,132.666667,96.0,81.0,64.34,50.31675,155.841667,No difficulty,31-45,0
Yes,Moderate hearing trouble,Yes,6.3,15.2,221.0,Male,66.0,Non-Hispanic White,Born in 50 US states or Washington,College graduate or above,...,214.0,117.0,78.666667,72.0,67.6,52.49825,104.99,Some difficulty,61-75,0
Yes,Moderate hearing trouble,No,5.7,13.8,235.0,Female,44.0,Other Hispanic,Unknown/NA,High school graduate/GED or equivalent,...,187.0,109.0,78.333333,81.333333,60.36,43.3125,135.308333,A lot of difficulty,31-45,0
Yes,Good,No,6.8,13.9,264.5,Male,43.0,Mexican American,Unknown/NA,9-11th grade (Includes 12th grade with no dipl...,...,186.0,113.666667,74.333333,72.0,62.64,47.543875,103.591667,Some difficulty,31-45,0
Yes,Good,No,6.5,14.0,241.0,Female,65.0,Non-Hispanic White,Born in 50 US states or Washington,High school graduate/GED or equivalent,...,188.0,125.666667,74.0,69.333333,63.1,47.159,40.313333,No difficulty,61-75,0


Check 1: Demographic Representation (Visualization)
Purpose: Ensure no group is severely underrepresented

In [5]:
df['functional_difficulty_composite'].unique()

array(['No difficulty', 'Some difficulty', 'A lot of difficulty',
       'Unknown/NA', 'Cannot do at all'], dtype=object)

In [6]:
df.columns

Index(['General_hearing_condition', 'Had_high_blood_pressure', 'WBC',
       'Haemoglobin', 'Platelete', 'Gender', 'Age', 'Race_Ethnicity',
       'Country_of_Birth', 'Education_Level', 'Marital_Status',
       'Household_Size', 'Income_to_Poverty_Ratio', 'Has_diabetes',
       'Takes_vitamin_supplements', 'Has_Disability', 'HDL_mg',
       'Has_Hepatitis', 'Covered_by_health_insurance', 'Tested_for_HIV_Virus',
       'General_health_condition', 'Received_Hepatitis_A_Vaccine',
       'Family_poverty_level_index', 'Has_Kidney_Failure', 'Had_Asthma',
       'Had_Arthritis', 'Had_heart_attack', 'Had_Thyroid',
       'Had_Liver_COndition', 'Had_Cancer', 'Teeth_and_gum_health',
       'Number_of_Moderate_Physical_activities_per_week',
       'Number_of_Vigorous_Physical_activities_per_week',
       'Number_of_hours_of_sleep', 'Cholestrol_level', 'SystolicBP',
       'DiastolicBP', 'Pulse', 'BODY_MEASURE_COMPOSITE', 'blood_macros',
       'mean_steroid_ng_dl', 'functional_difficulty_composit

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Assuming your dataframe is named 'df'
# df = pd.read_csv('your_nhanes_data.csv')

def analyze_representation_bias(df):
    """
    Analyze demographic representation to identify potential sampling bias
    """
    print("=" * 80)
    print("1. REPRESENTATION BIAS ANALYSIS")
    print("=" * 80)
    
    demographic_cols = ['Gender', 'Race_Ethnicity', 'Age_Group', 'Education_Level', 
                        'Marital_Status', 'Country_of_Birth']
    
    for col in demographic_cols:
        if col in df.columns:
            print(f"\n{col} Distribution:")
            print("-" * 40)
            counts = df[col].value_counts(dropna=False)
            percentages = df[col].value_counts(normalize=True, dropna=False) * 100
            
            result_df = pd.DataFrame({
                'Count': counts,
                'Percentage': percentages.round(2)
            })
            print(result_df)
            
            # Check for severe underrepresentation (< 5%)
            underrepresented = percentages[percentages < 5]
            if len(underrepresented) > 0:
                print(f"\n⚠️  WARNING: Underrepresented groups (< 5%):")
                for group, pct in underrepresented.items():
                    print(f"   - {group}: {pct:.2f}%")


def analyze_outcome_disparities(df):
    """
    Analyze health outcome disparities across demographic groups
    """
    print("\n" + "=" * 80)
    print("3. HEALTH OUTCOME DISPARITY ANALYSIS")
    print("=" * 80)
    
    # Conditions to analyze
    conditions = ['Has_diabetes', 'Has_high_blood_pressure', 'Had_heart_attack', 
                  'Had_Cancer', 'Had_Asthma', 'General_health_condition']
    
    demographic_cols = ['Gender', 'Race_Ethnicity', 'Age_Group']
    
    for condition in conditions:
        if condition not in df.columns:
            continue
            
        print(f"\n{condition}:")
        print("-" * 40)
        
        for demo_col in demographic_cols:
            if demo_col not in df.columns:
                continue
                
            if condition == 'General_health_condition':
                # For ordinal health condition, calculate mean rating
                health_order = ['Excellent', 'Very good', 'Good', 'Fair', 'Poor']
                temp_df = df[df[condition].isin(health_order)].copy()
                temp_df['health_score'] = temp_df[condition].map(
                    {h: i for i, h in enumerate(health_order)}
                )
                by_group = temp_df.groupby(demo_col)['health_score'].mean()
                print(f"\n  {demo_col} (lower score = better health):")
                print(f"  {by_group.round(2).to_dict()}")
            else:
                # For binary conditions
                by_group = df.groupby(demo_col)[condition].apply(
                    lambda x: (x == 'Yes').sum() / x.notna().sum() * 100
                )
                print(f"\n  {demo_col} (% with condition):")
                print(f"  {by_group.round(2).to_dict()}")


def analyze_continuous_health_metrics(df):
    """
    Analyze continuous health metrics for disparities
    """
    print("\n" + "=" * 80)
    print("4. CONTINUOUS HEALTH METRICS ANALYSIS")
    print("=" * 80)
    
    metrics = ['SystolicBP', 'DiastolicBP', 'Cholestrol_level', 
               'HDL_mg', 'WBC', 'Haemoglobin']
    demographic_cols = ['Gender', 'Race_Ethnicity']
    
    for metric in metrics:
        if metric not in df.columns:
            continue
            
        print(f"\n{metric}:")
        print("-" * 40)
        
        for demo_col in demographic_cols:
            if demo_col not in df.columns:
                continue
                
            print(f"\n  By {demo_col}:")
            by_group = df.groupby(demo_col)[metric].agg(['mean', 'std', 'count'])
            print(f"  {by_group.round(2)}")
            
            # Perform statistical test (ANOVA)
            groups = [group[metric].dropna() for name, group in df.groupby(demo_col)]
            if len(groups) > 1 and all(len(g) > 0 for g in groups):
                f_stat, p_value = stats.f_oneway(*groups)
                if p_value < 0.05:
                    print(f"  ⚠️  Significant difference detected (p={p_value:.4f})")


def analyze_socioeconomic_bias(df):
    """
    Analyze potential socioeconomic biases
    """
    print("\n" + "=" * 80)
    print("5. SOCIOECONOMIC BIAS ANALYSIS")
    print("=" * 80)
    
    if 'Income_to_Poverty_Ratio' in df.columns:
        # Create income categories
        df_temp = df.copy()
        df_temp['Income_Category'] = pd.cut(
            df_temp['Income_to_Poverty_Ratio'],
            bins=[0, 1.3, 3.5, 10],
            labels=['Below Poverty', 'Low Income', 'Above Low Income']
        )
        
        print("\nIncome Category Distribution:")
        print(df_temp['Income_Category'].value_counts(normalize=True) * 100)
        
        # Health insurance coverage by income
        if 'Covered_by_health_insurance' in df.columns:
            print("\nHealth Insurance Coverage by Income:")
            coverage = pd.crosstab(
                df_temp['Income_Category'],
                df_temp['Covered_by_health_insurance'],
                normalize='index'
            ) * 100
            print(coverage.round(2))
        
        # Health outcomes by income
        if 'General_health_condition' in df.columns:
            print("\nGeneral Health by Income:")
            health = pd.crosstab(
                df_temp['Income_Category'],
                df_temp['General_health_condition'],
                normalize='index'
            ) * 100
            print(health.round(2))
    


# Run the analysis
# generate_bias_report(df)

In [8]:
analyze_representation_bias(df)
analyze_outcome_disparities(df)
analyze_continuous_health_metrics(df)
analyze_socioeconomic_bias(df)

1. REPRESENTATION BIAS ANALYSIS

Gender Distribution:
----------------------------------------
        Count  Percentage
Gender                   
Female   5012       54.58
Male     4171       45.42

Race_Ethnicity Distribution:
----------------------------------------
                                     Count  Percentage
Race_Ethnicity                                        
Non-Hispanic White                    5090       55.43
Non-Hispanic Black                    1187       12.93
Other Race - Including Multi-Racial   1142       12.44
Other Hispanic                        1000       10.89
Mexican American                       764        8.32

Age_Group Distribution:
----------------------------------------
           Count  Percentage
Age_Group                   
61-75       2494       27.16
31-45       1746       19.01
46-60       1624       17.68
0-18        1187       12.93
19-30       1161       12.64
76+          971       10.57

Education_Level Distribution:
----------------

In [9]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import LabelEncoder

# ============================================================================
# TARGETED OVERSAMPLING FOR UNDERREPRESENTED GROUPS
# ============================================================================

print("Step 1: Targeted oversampling for underrepresented groups...")

# Define target percentages for underrepresented groups
target_percentages = {
    'Race_Ethnicity': {
        'Mexican American': 0.10  # Target 10%
    },
    'Education_Level': {
        'Less than 9th grade': 0.10  # Target 10%
    }
}

df_oversampled = df.copy()

# Oversample Race_Ethnicity groups
for group, target_pct in target_percentages['Race_Ethnicity'].items():
    current_count = len(df[df['Race_Ethnicity'] == group])
    current_pct = current_count / len(df)
    
    if current_pct < target_pct:
        # Calculate how many additional samples needed
        target_count = int(len(df) * target_pct)
        additional_samples = target_count - current_count
        
        # Oversample with replacement
        group_data = df[df['Race_Ethnicity'] == group]
        oversampled = group_data.sample(n=additional_samples, replace=True, random_state=42)
        df_oversampled = pd.concat([df_oversampled, oversampled], ignore_index=True)
        
        print(f"Oversampled '{group}': {current_count} → {target_count} ({current_pct:.2%} → {target_pct:.2%})")

# Oversample Education_Level groups
for group, target_pct in target_percentages['Education_Level'].items():
    current_count = len(df_oversampled[df_oversampled['Education_Level'] == group])
    current_pct = current_count / len(df_oversampled)
    
    if current_pct < target_pct:
        # Calculate based on the new size after previous oversampling
        target_count = int(len(df_oversampled) * target_pct)
        additional_samples = target_count - current_count
        
        # Oversample with replacement
        group_data = df_oversampled[df_oversampled['Education_Level'] == group]
        oversampled = group_data.sample(n=additional_samples, replace=True, random_state=42)
        df_oversampled = pd.concat([df_oversampled, oversampled], ignore_index=True)
        
        print(f"Oversampled '{group}': {current_count} → {target_count} ({current_pct:.2%} → {target_pct:.2%})")

print(f"\nOriginal df shape: {df.shape}")
print(f"Oversampled df shape: {df_oversampled.shape}")

# Verify new distributions
print("\n" + "="*50)
print("NEW DISTRIBUTIONS AFTER OVERSAMPLING")
print("="*50)
print("\nRace_Ethnicity Distribution:")
print(df_oversampled['Race_Ethnicity'].value_counts())
print("\nEducation_Level Distribution:")
print(df_oversampled['Education_Level'].value_counts())

df_oversampled.head()

Step 1: Targeted oversampling for underrepresented groups...
Oversampled 'Mexican American': 764 → 918 (8.32% → 10.00%)
Oversampled 'Less than 9th grade': 396 → 933 (4.24% → 10.00%)

Original df shape: (9183, 44)
Oversampled df shape: (9874, 44)

NEW DISTRIBUTIONS AFTER OVERSAMPLING

Race_Ethnicity Distribution:
Race_Ethnicity
Non-Hispanic White                     5191
Non-Hispanic Black                     1235
Other Race - Including Multi-Racial    1193
Other Hispanic                         1157
Mexican American                       1098
Name: count, dtype: int64

Education_Level Distribution:
Education_Level
Some college or AA degree                             2655
College graduate or above                             2625
High school graduate/GED or equivalent                1753
Unknown/NA                                            1232
Less than 9th grade                                    933
9-11th grade (Includes 12th grade with no diploma)     676
Name: count, dtype: int6

Unnamed: 0,General_hearing_condition,Had_high_blood_pressure,WBC,Haemoglobin,Platelete,Gender,Age,Race_Ethnicity,Country_of_Birth,Education_Level,...,Cholestrol_level,SystolicBP,DiastolicBP,Pulse,BODY_MEASURE_COMPOSITE,blood_macros,mean_steroid_ng_dl,functional_difficulty_composite,Age_Group,zero_count
0,Excellent,Yes,4.7,15.7,259.0,Male,43.0,Other Race - Including Multi-Racial,Unknown/NA,College graduate or above,...,264.0,132.666667,96.0,81.0,64.34,50.31675,155.841667,No difficulty,31-45,0
1,Moderate hearing trouble,Yes,6.3,15.2,221.0,Male,66.0,Non-Hispanic White,Born in 50 US states or Washington,College graduate or above,...,214.0,117.0,78.666667,72.0,67.6,52.49825,104.99,Some difficulty,61-75,0
2,Moderate hearing trouble,No,5.7,13.8,235.0,Female,44.0,Other Hispanic,Unknown/NA,High school graduate/GED or equivalent,...,187.0,109.0,78.333333,81.333333,60.36,43.3125,135.308333,A lot of difficulty,31-45,0
3,Good,No,6.8,13.9,264.5,Male,43.0,Mexican American,Unknown/NA,9-11th grade (Includes 12th grade with no dipl...,...,186.0,113.666667,74.333333,72.0,62.64,47.543875,103.591667,Some difficulty,31-45,0
4,Good,No,6.5,14.0,241.0,Female,65.0,Non-Hispanic White,Born in 50 US states or Washington,High school graduate/GED or equivalent,...,188.0,125.666667,74.0,69.333333,63.1,47.159,40.313333,No difficulty,61-75,0


In [10]:
df = df_oversampled.copy()

## Encoding

In [11]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder


# Nominal categorical → One-Hot
ohe_cols = [
    "Race_Ethnicity",
    "Gender",
    "Country_of_Birth",
    "Marital_Status"
]

# Binary categorical → map to 0/1/2 (Unknown as 2)
binary_cols = [
    "Covered_by_health_insurance",
    # "Had_alcohol_in_the_past",
    "Has_Kidney_Failure",
    "Had_high_blood_pressure",
    "Takes_vitamin_supplements",
    "Tested_for_HIV_Virus",
    "Has_diabetes",
    "Had_Asthma",
    "Had_Arthritis",
    "Had_heart_attack",
    "Had_Thyroid",
    "Had_Liver_COndition",
    "Had_Cancer",
    "Has_Hepatitis",
    "Has_Disability",
    "Received_Hepatitis_A_Vaccine"
]

# Ordinal categorical → ordered encoding + Unknown at the end
ordinal_cols = {
    "Age_Group": [["0-18", "19-30", "31-45", "46-60", "61-75", "76+", "Unknown"]],
    "Education_Level": [["Less than 9th grade",
                         "9-11th grade (Includes 12th grade with no diploma)",
                         "High school graduate/GED or equivalent",
                         "Some college or AA degree",
                         "College graduate or above",
                         "Unknown"]],
    "General_health_condition": [["Poor", "Fair", "Good", "Very good", "Excellent", "Unknown"]],
    "General_hearing_condition": [["Deaf", "A lot of trouble", "Moderate hearing trouble",
                                   "A little trouble", "Good", "Excellent", "Unknown"]],
    "functional_difficulty_composite": [['Cannot do at all', 'A lot of difficulty', 'No difficulty', 'Some difficulty','Unknown']],
    "Teeth_and_gum_health": [["Poor", "Fair", "Good", "Very good", "Excellent", "Unknown"]]
}

# --------------------------
# Encoding functions
# --------------------------

def encode_dataset(df):
    df_encoded = df.copy()

    # Normalize all Unknown-like responses into "Unknown"
    df_encoded = df_encoded.replace(
        {"Don't know": "Unknown", "Refused": "Unknown", "Not Applicable": "Unknown", "N/A": "Unknown", "Unknown/NA": "Unknown"}
    )

    # Binary encoding: map Yes/No/Unknown
    for col in binary_cols:
        if col == "Has_diabetes":
            df_encoded[col] = df_encoded[col].map({"No": 0, "Yes": 1, "Borderline": 2, "Unknown": 2})
        elif col == "Received_Hepatitis_A_Vaccine":
            df_encoded[col] = df_encoded[col].map({"No doses": 0, "Yes, at least 2 doses": 1, "Less than 2 doses": 1, "Unknown": 2, })
        else:
            df_encoded[col] = df_encoded[col].map({"No": 0, "Yes": 1, "Unknown": 2})

    # Ordinal encoding
    for col, categories in ordinal_cols.items():
        if col in df_encoded.columns:
            encoder = OrdinalEncoder(categories=categories, dtype=int)
            df_encoded[col] = encoder.fit_transform(df_encoded[[col]])

    # One-Hot encoding
    df_encoded = pd.get_dummies(df_encoded, columns=[col for col in ohe_cols if col in df_encoded.columns], drop_first=False, dtype=int)

    return df_encoded

# --------------------------
# Usage
# --------------------------

df_encoded = encode_dataset(df)

print("Original shape:", df.shape)
print("Encoded shape:", df_encoded.shape)
df_encoded.head()


Original shape: (9874, 44)
Encoded shape: (9874, 53)


Unnamed: 0,General_hearing_condition,Had_high_blood_pressure,WBC,Haemoglobin,Platelete,Age,Education_Level,Household_Size,Income_to_Poverty_Ratio,Has_diabetes,...,Race_Ethnicity_Other Hispanic,Race_Ethnicity_Other Race - Including Multi-Racial,Gender_Female,Gender_Male,Country_of_Birth_Born in 50 US states or Washington,Country_of_Birth_Unknown,Marital_Status_Married/Living with partner,Marital_Status_Never married,Marital_Status_Unknown,Marital_Status_Widowed/Divorced/Separated
0,5,1,4.7,15.7,259.0,43.0,4,4.0,5.0,0,...,0,1,0,1,0,1,1,0,0,0
1,2,1,6.3,15.2,221.0,66.0,4,2.0,5.0,0,...,0,0,0,1,1,0,1,0,0,0
2,2,0,5.7,13.8,235.0,44.0,2,7.0,1.41,1,...,1,0,1,0,0,1,1,0,0,0
3,4,0,6.8,13.9,264.5,43.0,1,2.0,0.63,0,...,0,0,0,1,0,1,0,1,0,0
4,4,0,6.5,14.0,241.0,65.0,2,2.0,5.0,0,...,0,0,1,0,1,0,1,0,0,0


In [12]:
for col in df_encoded.columns:
    print(f"Column: {col}")
    print(df_encoded[col].unique())
    print("-" * 50)

Column: General_hearing_condition
[5 2 4 3 1 0 6]
--------------------------------------------------
Column: Had_high_blood_pressure
[1 0 2]
--------------------------------------------------
Column: WBC
[ 4.7   6.3   5.7   6.8   6.5   5.5   6.    8.3   5.8   9.7   9.5   5.6
  5.4   6.6  10.    6.2   4.5   8.5   6.7   5.3   6.4   9.3   4.    6.45
  8.    8.8   8.1  10.4   8.2  14.3   7.8   5.9   7.    8.7   7.7   5.
 12.1   5.2   7.1   6.1   7.6  10.3  13.3  11.9  11.8   4.9   7.4   4.8
  7.5  10.6  13.9  13.4   3.9  10.1   4.6   7.2  13.7  11.3   6.9   4.2
  7.3  12.9   5.1  15.4   9.2  10.8  10.5   3.4  15.1   8.4   3.5  12.2
  9.9   7.9  11.7   4.4   9.1   9.6  10.2   9.8   4.1  11.4   8.6  13.
 12.8   9.    4.3  15.6   9.4   8.9   3.   13.2  11.2   2.7  11.1   3.8
 13.5  12.6  10.9  17.4  10.7  11.5  17.5  12.7  11.6  12.    3.7  14.5
  3.6   2.6   3.1  13.1   3.2   2.5   2.9   2.3  11.   12.4   2.2  14.1
 14.   14.2  12.5  12.3   3.3   2.8  15.9  13.8   2.4  14.7  16.   15.3
 16.7

In [13]:
# verify if encoding is sucessful

for col in binary_cols:
    if col in df_encoded.columns:
        print(col, df_encoded[col].unique())


for col in ordinal_cols.keys():
    if col in df_encoded.columns:
        print(col, df_encoded[col].unique())


[col for col in df_encoded.columns if any(base in col for base in ohe_cols)]

print("Original shape:", df.shape)
print("Encoded shape:", df_encoded.shape)


df = df_encoded

Covered_by_health_insurance [1 0 2]
Has_Kidney_Failure [0 1 2]
Had_high_blood_pressure [1 0 2]
Takes_vitamin_supplements [0 1]
Tested_for_HIV_Virus [0 1 2]
Has_diabetes [0 1 2]
Had_Asthma [0 1 2]
Had_Arthritis [1 0 2]
Had_heart_attack [0 2 1]
Had_Thyroid [0 1 2]
Had_Liver_COndition [0 2 1]
Had_Cancer [0 1 2]
Has_Hepatitis [0 1 2]
Has_Disability [0 1 2]
Received_Hepatitis_A_Vaccine [2 1 0]
Age_Group [2 4 1 3 5 0]
Education_Level [4 2 1 3 0 5]
General_health_condition [4 2 3 1 0 5]
General_hearing_condition [5 2 4 3 1 0 6]
functional_difficulty_composite [2 3 1 4 0]
Teeth_and_gum_health [4 3 1 0 2 5]
Original shape: (9874, 44)
Encoded shape: (9874, 53)


In [14]:

numerical_cols = ["Number_of_Vigorous_Physical_activities_per_week","Number_of_Moderate_Physical_activities_per_week",
                  "mean_steroid_ng_dl","blood_macros","HDL_mg","WBC","Platelete","SystolicBP","Household_Size","Cholestrol_level","Pulse","DiastolicBP","Family_poverty_level_index","Income_to_Poverty_Ratio",
                  "Number_of_hours_of_sleep","BODY_MEASURE_COMPOSITE","Age","Haemoglobin"]

# Skewness
print("\n=== Skewness of Numerical Columns ===")
print(df[numerical_cols].skew().sort_values(ascending=False))



=== Skewness of Numerical Columns ===
Number_of_Vigorous_Physical_activities_per_week    36.547425
Number_of_Moderate_Physical_activities_per_week    18.269649
mean_steroid_ng_dl                                  9.782327
blood_macros                                        6.127865
HDL_mg                                              1.272052
WBC                                                 1.261373
Platelete                                           1.030677
SystolicBP                                          0.989939
Household_Size                                      0.802928
Cholestrol_level                                    0.710040
Pulse                                               0.576913
DiastolicBP                                         0.477006
Family_poverty_level_index                          0.395935
Income_to_Poverty_Ratio                             0.113195
Number_of_hours_of_sleep                            0.072260
BODY_MEASURE_COMPOSITE                        

In [15]:
# Columns to log-transform
# other columns above 0.5 are binary columns
skewed_cols = ["Number_of_Vigorous_Physical_activities_per_week", "Number_of_Moderate_Physical_activities_per_week", "mean_steroid_ng_dl", 
                            "blood_macros", "HDL_mg", "WBC", "Platelete", "SystolicBP", "Household_Size", "Cholestrol_level", "Pulse"]

# Apply log1p safely (handles zeros)
for col in skewed_cols:
    if col in df.columns:
        df[col] = np.log1p(df[col].clip(lower=0))

## Scaling

In [16]:
import matplotlib.pyplot as plt
import seaborn as sns

# Select numerical (continuous) columns
numerical_cols = ["Number_of_Vigorous_Physical_activities_per_week","Number_of_Moderate_Physical_activities_per_week",
                  "mean_steroid_ng_dl","blood_macros","HDL_mg","WBC","Platelete","SystolicBP","Household_Size","Cholestrol_level","Pulse","DiastolicBP","Family_poverty_level_index","Income_to_Poverty_Ratio",
                  "Number_of_hours_of_sleep","BODY_MEASURE_COMPOSITE","Age","Haemoglobin"]

# Z-Score method (check how many values > |3|)
from scipy.stats import zscore
outlier_report = {}
for col in numerical_cols:
    z_scores = zscore(df[col].dropna())
    outliers = (abs(z_scores) > 3).sum()
    outlier_report[col] = outliers

print("Outliers detected (Z-score > 3):")
for col, count in outlier_report.items():
    print(f"{col}: {count}")



Outliers detected (Z-score > 3):
Number_of_Vigorous_Physical_activities_per_week: 30
Number_of_Moderate_Physical_activities_per_week: 17
mean_steroid_ng_dl: 57
blood_macros: 107
HDL_mg: 72
WBC: 97
Platelete: 102
SystolicBP: 76
Household_Size: 0
Cholestrol_level: 68
Pulse: 61
DiastolicBP: 86
Family_poverty_level_index: 0
Income_to_Poverty_Ratio: 0
Number_of_hours_of_sleep: 176
BODY_MEASURE_COMPOSITE: 142
Age: 0
Haemoglobin: 108


In [17]:
#StandardScaler assumes normal-like distribution. Outliers will still pull the mean/std strongly.

# RobustScaler uses median and IQR → much better when outliers exist.


from sklearn.preprocessing import StandardScaler, RobustScaler

# Define ALL columns to scale (numerical + ordinal + binary)

# 1. All numerical continuous columns
numerical_cols = [
    "Number_of_Vigorous_Physical_activities_per_week",
    "Number_of_Moderate_Physical_activities_per_week",
    "mean_steroid_ng_dl", "blood_macros", "HDL_mg", "WBC", 
    "Platelete", "SystolicBP", "Household_Size", "Cholestrol_level",
    "Pulse", "DiastolicBP", "Family_poverty_level_index", 
    "Income_to_Poverty_Ratio", "Number_of_hours_of_sleep",
    "BODY_MEASURE_COMPOSITE", "Age", "Haemoglobin"
]

# 2. Ordinal encoded columns (now integers in df_encoded)
ordinal_cols_list = [
    "Age_Group", 
    "Education_Level", 
    "General_health_condition",
    "General_hearing_condition", 
    "functional_difficulty_composite",
    "Teeth_and_gum_health"
]

# 3. Binary encoded columns (now 0/1/2 in df_encoded)
binary_cols_list = [
    "Covered_by_health_insurance",
    "Had_alcohol_in_the_past",
    "Has_Kidney_Failure",
    "Had_high_blood_pressure",
    "Takes_vitamin_supplements",
    "Tested_for_HIV_Virus",
    "Has_diabetes",
    "Had_Asthma",
    "Had_Arthritis",
    "Had_heart_attack",
    "Had_Thyroid",
    "Had_Liver_COndition",
    "Had_Cancer",
    "Has_Hepatitis",
    "Has_Disability",
    "Received_Hepatitis_A_Vaccine"
]

# Combine all columns to scale
cols_to_scale = numerical_cols + ordinal_cols_list + binary_cols_list

# Filter only columns that exist in df_encoded
cols_to_scale = [col for col in cols_to_scale if col in df_encoded.columns]

print(f"Total columns to scale: {len(cols_to_scale)}")
print(f"  - Numerical continuous: {len([c for c in numerical_cols if c in df_encoded.columns])}")
print(f"  - Ordinal encoded: {len([c for c in ordinal_cols_list if c in df_encoded.columns])}")
print(f"  - Binary encoded: {len([c for c in binary_cols_list if c in df_encoded.columns])}")

# Identify one-hot encoded columns (will NOT be scaled)
all_cols = set(df_encoded.columns)
cols_to_scale_set = set(cols_to_scale)
ohe_cols = list(all_cols - cols_to_scale_set)

print(f"  - One-hot encoded (NOT scaled): {len(ohe_cols)}")

# Apply RobustScaler
scaler = RobustScaler()
df_scaled = df_encoded.copy()
df_scaled[cols_to_scale] = scaler.fit_transform(df_encoded[cols_to_scale])

print(f"\n✓ RobustScaler applied successfully!")
print(f"✓ Scaled {len(cols_to_scale)} features")
print(f"✓ One-hot encoded columns remain as 0/1")

Total columns to scale: 39
  - Numerical continuous: 18
  - Ordinal encoded: 6
  - Binary encoded: 15
  - One-hot encoded (NOT scaled): 14

✓ RobustScaler applied successfully!
✓ Scaled 39 features
✓ One-hot encoded columns remain as 0/1


In [18]:
df.head()

Unnamed: 0,General_hearing_condition,Had_high_blood_pressure,WBC,Haemoglobin,Platelete,Age,Education_Level,Household_Size,Income_to_Poverty_Ratio,Has_diabetes,...,Race_Ethnicity_Other Hispanic,Race_Ethnicity_Other Race - Including Multi-Racial,Gender_Female,Gender_Male,Country_of_Birth_Born in 50 US states or Washington,Country_of_Birth_Unknown,Marital_Status_Married/Living with partner,Marital_Status_Never married,Marital_Status_Unknown,Marital_Status_Widowed/Divorced/Separated
0,5,1,1.740466,15.7,5.560682,43.0,4,1.609438,5.0,0,...,0,1,0,1,0,1,1,0,0,0
1,2,1,1.987874,15.2,5.402677,66.0,4,1.098612,5.0,0,...,0,0,0,1,1,0,1,0,0,0
2,2,0,1.902108,13.8,5.463832,44.0,2,2.079442,1.41,1,...,1,0,1,0,0,1,1,0,0,0
3,4,0,2.054124,13.9,5.581615,43.0,1,1.098612,0.63,0,...,0,0,0,1,0,1,0,1,0,0
4,4,0,2.014903,14.0,5.488938,65.0,2,1.098612,5.0,0,...,0,0,1,0,1,0,1,0,0,0


In [19]:
df.isna().sum()

General_hearing_condition                              0
Had_high_blood_pressure                                0
WBC                                                    0
Haemoglobin                                            0
Platelete                                              0
Age                                                    0
Education_Level                                        0
Household_Size                                         0
Income_to_Poverty_Ratio                                0
Has_diabetes                                           0
Takes_vitamin_supplements                              0
Has_Disability                                         0
HDL_mg                                                 0
Has_Hepatitis                                          0
Covered_by_health_insurance                            0
Tested_for_HIV_Virus                                   0
General_health_condition                               0
Received_Hepatitis_A_Vaccine   

In [20]:
# final check
for col in df_encoded.columns:
    print(f"Column: {col}")
    print(df_encoded[col].unique())
    print("-" * 50)

Column: General_hearing_condition
[5 2 4 3 1 0 6]
--------------------------------------------------
Column: Had_high_blood_pressure
[1 0 2]
--------------------------------------------------
Column: WBC
[1.74046617 1.98787435 1.90210753 2.05412373 2.01490302 1.87180218
 1.94591015 2.2300144  1.91692261 2.37024374 2.35137526 1.88706965
 1.85629799 2.02814825 2.39789527 1.97408103 1.70474809 2.2512918
 2.04122033 1.84054963 2.00148    2.3321439  1.60943791 2.00821403
 2.19722458 2.28238239 2.20827441 2.43361336 2.21920348 2.72785283
 2.17475172 1.93152141 2.07944154 2.27212589 2.16332303 1.79175947
 2.57261223 1.82454929 2.09186406 1.96009478 2.1517622  2.42480273
 2.66025954 2.55722731 2.54944517 1.77495235 2.12823171 1.75785792
 2.14006616 2.4510051  2.70136121 2.66722821 1.58923521 2.40694511
 1.7227666  2.10413415 2.68784749 2.50959926 2.06686276 1.64865863
 2.11625551 2.63188884 1.80828877 2.79728133 2.32238772 2.46809953
 2.44234704 1.48160454 2.77881927 2.24070969 1.5040774  2.58

In [21]:
df.to_csv('Data/df_scaled.csv')

In [22]:





# target_size = int(len(df_balanced) * 0.08)  # 8% of dataset

# # Oversample by Race
# race_dfs = []
# for race in df_balanced['Race_Ethnicity'].unique():
#     race_df = df_balanced[df_balanced['Race_Ethnicity'] == race].copy()
#     if len(race_df) < target_size:
#         race_df = race_df.sample(n=target_size, replace=True, random_state=42)
#     race_dfs.append(race_df)

# df_balanced = pd.concat(race_dfs, ignore_index=True)

# # Oversample by Education
# edu_dfs = []
# for edu in df_balanced['Education_Level'].unique():
#     edu_df = df_balanced[df_balanced['Education_Level'] == edu].copy()
#     if edu == 'Unknown/NA':
#         edu_dfs.append(edu_df)
#         continue
#     if len(edu_df) < target_size:
#         edu_df = edu_df.sample(n=target_size, replace=True, random_state=42)
#     edu_dfs.append(edu_df)

# df_balanced = pd.concat(edu_dfs, ignore_index=True).reset_index(drop=True)

# print(f"Dataset size: {len(df)} -> {len(df_balanced)}")

# # Step 2: Encode demographics for regression
# print("\nStep 2: Encoding demographics...")

# le_race = LabelEncoder()
# le_gender = LabelEncoder()
# le_age = LabelEncoder()

# df_balanced['Race_encoded'] = le_race.fit_transform(df_balanced['Race_Ethnicity'].astype(str))
# df_balanced['Gender_encoded'] = le_gender.fit_transform(df_balanced['Gender'].astype(str))
# df_balanced['Age_encoded'] = le_age.fit_transform(df_balanced['Age_Group'].astype(str))

# X_demographics = df_balanced[['Race_encoded', 'Gender_encoded', 'Age_encoded']]

# # Step 3: Correct continuous variables using regression
# print("\nStep 3: Correcting continuous variables...")

# continuous_vars = ['SystolicBP', 'DiastolicBP', 'Cholestrol_level', 
#                    'HDL_mg', 'WBC', 'Haemoglobin']

# for var in continuous_vars:
#     if var not in df_balanced.columns:
#         print(f"  ⚠️  {var} not found in dataset, skipping...")
#         continue
    
#     # Get non-missing data
#     mask = df_balanced[var].notna()
#     if mask.sum() < 10:
#         print(f"  ⚠️  {var} has too few valid values, skipping...")
#         continue
    
#     X = X_demographics[mask]
#     y = df_balanced.loc[mask, var].astype(float)
    
#     # Fit regression
#     model = LinearRegression()
#     model.fit(X, y)
    
#     # Get residuals
#     predictions = model.predict(X)
#     residuals = y - predictions
    
#     # Overwrite ONLY this column with residuals
#     df_balanced.loc[mask, var] = residuals
    
#     print(f"  ✓ {var} corrected in place (R² = {model.score(X, y):.3f})")

# # Step 4: Convert and correct binary variables
# print("\nStep 4: Converting and correcting binary outcomes...")

# binary_vars = ['Has_diabetes', 'Had_heart_attack', 'Had_Cancer', 'Had_Asthma']

# for var in binary_vars:
#     if var not in df_balanced.columns:
#         print(f"  ⚠️  {var} not found in dataset, skipping...")
#         continue
    
#     print(f"\n  Processing {var}...")
#     print(f"    Original unique values: {df_balanced[var].unique()}")
    
#     # FORCE convert to numeric - handle ALL possible values
#     value_map = {}
#     for val in df_balanced[var].unique():
#         if pd.isna(val):
#             continue
#         val_str = str(val).strip().lower()
#         if val_str == 'yes':
#             value_map[val] = 1.0
#         else:  # No, Borderline, Refused, Don't know, etc. all become 0
#             value_map[val] = 0.0
    
#     # Apply mapping
#     df_balanced[var] = df_balanced[var].map(value_map)
#     print(f"    Converted to numeric: {df_balanced[var].unique()}")
    
#     # Get non-missing data
#     mask = df_balanced[var].notna()
#     if mask.sum() < 10:
#         print(f"    ⚠️  Too few valid values, skipping...")
#         continue
    
#     X = X_demographics[mask]
#     y = df_balanced.loc[mask, var]
    
#     # Skip if only one class
#     if y.nunique() < 2:
#         print(f"    ⚠️  Only one class present, skipping...")
#         continue
    
#     print(f"    Class distribution: 0={(y==0).sum()}, 1={(y==1).sum()}")
    
#     # Fit logistic regression
#     model = LogisticRegression(max_iter=1000, random_state=42)
#     model.fit(X, y.astype(int))
    
#     # Get residuals
#     pred_probs = model.predict_proba(X)[:, 1]
#     residuals = y - pred_probs
    
#     # Overwrite ONLY this column with residuals
#     df_balanced.loc[mask, var] = residuals
    
#     print(f"    ✓ {var} corrected in place")

# # Step 5: Handle General_health_condition encoding if needed
# print("\nStep 5: Encoding General_health_condition...")

# if 'General_health_condition' in df_balanced.columns:
#     if df_balanced['General_health_condition'].dtype == 'object':
#         print(f"  General_health_condition unique values: {df_balanced['General_health_condition'].unique()}")
#         health_map = {'Excellent': 0, 'Very good': 1, 'Good': 2, 
#                       'Fair': 3, 'Poor': 4, "Don't know": 2}
#         df_balanced['General_health_condition'] = df_balanced['General_health_condition'].map(health_map)

# print("\n" + "="*80)
# print("BIAS CORRECTION COMPLETE")
# print("="*80)
# print(f"Total columns in df_balanced: {len(df_balanced.columns)}")
# print(f"Modified columns: {continuous_vars + binary_vars + ['General_health_condition']}")
# print(f"All other columns remain unchanged")
# print("="*80)

# # This is your final dataframe with all columns
# df_final = df_balanced

In [23]:
# df = df_final.copy()

In [24]:
# from scipy import stats

# print("="*80)
# print("BIAS VERIFICATION AFTER CORRECTION")
# print("="*80)

# # Check continuous variables - means should be near 0 for all groups
# continuous_vars = ['SystolicBP', 'DiastolicBP', 'Cholestrol_level', 
#                    'HDL_mg', 'WBC', 'Haemoglobin']

# print("\nCONTINUOUS VARIABLES - Mean by Race (should all be ~0):")
# print("-"*80)
# for var in continuous_vars:
#     print(f"\n{var}:")
#     race_means = df_final.groupby('Race_Ethnicity')[var].mean()
#     print(race_means)
    
#     # ANOVA test
#     groups = [df_final[df_final['Race_Ethnicity']==race][var].dropna() 
#               for race in df_final['Race_Ethnicity'].unique()]
#     f_stat, p_val = stats.f_oneway(*groups)
    
#     if p_val > 0.05:
#         print(f"✓ No significant bias (p={p_val:.4f})")
#     else:
#         print(f"⚠️ Some bias remains (p={p_val:.4f})")

# # Check binary variables - means should be near 0 for all groups
# binary_vars = ['Has_diabetes', 'Had_heart_attack', 'Had_Cancer', 'Had_Asthma']

# print("\n" + "="*80)
# print("BINARY VARIABLES (RESIDUALS) - Mean by Race (should all be ~0):")
# print("-"*80)
# for var in binary_vars:
#     print(f"\n{var}:")
#     race_means = df_final.groupby('Race_Ethnicity')[var].mean()
#     print(race_means)
    
#     # ANOVA test
#     groups = [df_final[df_final['Race_Ethnicity']==race][var].dropna() 
#               for race in df_final['Race_Ethnicity'].unique()]
#     f_stat, p_val = stats.f_oneway(*groups)
    
#     if p_val > 0.05:
#         print(f"✓ No significant bias (p={p_val:.4f})")
#     else:
#         print(f"⚠️ Some bias remains (p={p_val:.4f})")