### Bias Detection and Handling

In [301]:
# import pandas as pd
# df = pd.read_pickle('Data/df_cleaned.pkl')

import sys
import numpy as np
import types
import pandas as pd

# Load the cleaned dataset; index_col=0 uses the first CSV column as the row index.
df = pd.read_csv('Data/df_cleaned.csv', index_col=0)


In [302]:
df.head()

Unnamed: 0,Had_alcohol_in_the_past,General_hearing_condition,Had_high_blood_pressure,WBC,Haemoglobin,Platelete,Gender,Age,Race_Ethnicity,Country_of_Birth,...,Number_of_hours_of_sleep,Cholestrol_level,SystolicBP,DiastolicBP,Pulse,BODY_MEASURE_COMPOSITE,blood_macros,mean_steroid_ng_dl,functional_difficulty_composite,Age_Group
0,Yes,Excellent,Yes,4.7,15.7,259.0,Male,43.0,Other Race - Including Multi-Racial,Unknown/NA,...,9.5,264.0,132.666667,96.0,81.0,64.34,50.31675,155.841667,No difficulty,31-45
1,Yes,Moderate hearing trouble,Yes,6.3,15.2,221.0,Male,66.0,Non-Hispanic White,Born in 50 US states or Washington,...,9.0,214.0,117.0,78.666667,72.0,67.6,52.49825,104.99,Some difficulty,61-75
2,Yes,Moderate hearing trouble,No,5.7,13.8,235.0,Female,44.0,Other Hispanic,Unknown/NA,...,8.0,187.0,109.0,78.333333,81.333333,60.36,43.3125,135.308333,A lot of difficulty,31-45
3,Yes,Good,No,6.8,13.9,264.5,Male,43.0,Mexican American,Unknown/NA,...,7.5,186.0,113.666667,74.333333,72.0,62.64,47.543875,103.591667,Some difficulty,31-45
4,Yes,Good,No,6.5,14.0,241.0,Female,65.0,Non-Hispanic White,Born in 50 US states or Washington,...,8.0,188.0,125.666667,74.0,69.333333,63.1,47.159,40.313333,No difficulty,61-75


Check 1: Demographic Representation (Tabular Summary)
Purpose: Ensure no group is severely underrepresented

In [303]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Assuming your dataframe is named 'df'
# df = pd.read_csv('your_nhanes_data.csv')

def analyze_representation_bias(df, demographic_cols=None, threshold=5.0):
    """
    Print the distribution of each demographic column and flag small groups.

    For every requested column that is present in ``df`` a table of counts and
    percentages is printed (missing values are counted as their own group),
    followed by a warning that lists each group whose share is below
    ``threshold`` percent.

    Args:
        df: pandas DataFrame to inspect.
        demographic_cols: optional list of column names to analyse. When
            omitted, the notebook's standard demographic columns are used.
            Columns that are not in ``df`` are skipped silently.
        threshold: percentage below which a group is reported as
            underrepresented (default 5.0).

    Returns:
        None. All results are written to stdout.
    """
    if demographic_cols is None:
        demographic_cols = ['Gender', 'Race_Ethnicity', 'Age_Group', 'Education_Level',
                            'Marital_Status', 'Country_of_Birth']

    print("=" * 80)
    print("1. REPRESENTATION BIAS ANALYSIS")
    print("=" * 80)

    for col in demographic_cols:
        if col not in df.columns:
            continue

        print(f"\n{col} Distribution:")
        print("-" * 40)

        # Count once and derive the shares from the same Series so the two
        # columns of the printed table are guaranteed to line up.
        counts = df[col].value_counts(dropna=False)
        percentages = counts / counts.sum() * 100

        result_df = pd.DataFrame({
            'Count': counts,
            'Percentage': percentages.round(2)
        })
        print(result_df)

        # The check uses the unrounded shares; only the display is rounded.
        underrepresented = percentages[percentages < threshold]
        if len(underrepresented) > 0:
            print(f"\n⚠️  WARNING: Underrepresented groups (< {threshold:g}%):")
            for group, pct in underrepresented.items():
                print(f"   - {group}: {pct:.2f}%")


def analyze_outcome_disparities(df):
    """
    Print health-outcome rates for each demographic group.

    For every condition column present in ``df`` and every demographic column
    present in ``df`` the function prints one dictionary keyed by group:

    * ``General_health_condition`` is treated as ordinal; rows whose value is
      one of Excellent / Very good / Good / Fair / Poor are scored 0..4 and
      the mean score per group is shown (lower score = better health).
    * every other condition is treated as a yes/no answer and the percentage
      of non-missing answers equal to ``'Yes'`` is shown.

    Args:
        df: pandas DataFrame holding the condition and demographic columns.

    Returns:
        None. All results are written to stdout.
    """
    print("\n" + "=" * 80)
    print("3. HEALTH OUTCOME DISPARITY ANALYSIS")
    print("=" * 80)

    # The blood-pressure column in this dataset is spelled
    # 'Had_high_blood_pressure'; the previous list only contained
    # 'Has_high_blood_pressure', so the condition was silently skipped.
    # Both spellings are accepted; absent columns are skipped below.
    conditions = ['Has_diabetes', 'Had_high_blood_pressure', 'Has_high_blood_pressure',
                  'Had_heart_attack', 'Had_Cancer', 'Had_Asthma',
                  'General_health_condition']

    demographic_cols = ['Gender', 'Race_Ethnicity', 'Age_Group']

    # Ordinal scale for the self-rated health column (built once, not per loop).
    health_order = ['Excellent', 'Very good', 'Good', 'Fair', 'Poor']
    health_scores = {label: rank for rank, label in enumerate(health_order)}

    for condition in conditions:
        if condition not in df.columns:
            continue

        print(f"\n{condition}:")
        print("-" * 40)

        for demo_col in demographic_cols:
            if demo_col not in df.columns:
                continue

            if condition == 'General_health_condition':
                # Only rows with a recognised rating take part in the mean.
                rated = df[df[condition].isin(health_order)]
                by_group = (
                    rated[condition].map(health_scores)
                    .groupby(rated[demo_col])
                    .mean()
                )
                print(f"\n  {demo_col} (lower score = better health):")
                print(f"  {by_group.round(2).to_dict()}")
            else:
                # Share of 'Yes' among non-missing answers. Taking the mean of
                # the non-missing flags yields NaN (not a 0/0 division) for a
                # group that has no answers at all.
                by_group = df.groupby(demo_col)[condition].apply(
                    lambda answers: answers.dropna().eq('Yes').mean() * 100
                )
                print(f"\n  {demo_col} (% with condition):")
                print(f"  {by_group.round(2).to_dict()}")


def analyze_continuous_health_metrics(df):
    """
    Print per-group summaries of continuous health metrics and flag differences.

    For every metric column present in ``df`` and every demographic column
    present in ``df`` the function prints the group mean, standard deviation
    and count, then runs a one-way ANOVA across the groups and prints a
    warning when the p-value is below 0.05.

    Groups without any non-missing value for the metric are left out of the
    ANOVA instead of cancelling the test for all remaining groups; the test
    runs whenever at least two groups still have data.

    Args:
        df: pandas DataFrame holding the metric and demographic columns.

    Returns:
        None. All results are written to stdout.
    """
    print("\n" + "=" * 80)
    print("4. CONTINUOUS HEALTH METRICS ANALYSIS")
    print("=" * 80)

    metrics = ['SystolicBP', 'DiastolicBP', 'Cholestrol_level',
               'HDL_mg', 'WBC', 'Haemoglobin']
    demographic_cols = ['Gender', 'Race_Ethnicity']

    for metric in metrics:
        if metric not in df.columns:
            continue

        print(f"\n{metric}:")
        print("-" * 40)

        for demo_col in demographic_cols:
            if demo_col not in df.columns:
                continue

            grouped_metric = df.groupby(demo_col)[metric]

            print(f"\n  By {demo_col}:")
            by_group = grouped_metric.agg(['mean', 'std', 'count'])
            print(f"  {by_group.round(2)}")

            # One-way ANOVA over the groups that actually have observations.
            samples = [values.dropna() for _, values in grouped_metric]
            samples = [sample for sample in samples if len(sample) > 0]
            if len(samples) > 1:
                _, p_value = stats.f_oneway(*samples)
                if p_value < 0.05:
                    print(f"  ⚠️  Significant difference detected (p={p_value:.4f})")


def analyze_socioeconomic_bias(df):
    """
    Print income-band distribution and how insurance / health vary by band.

    ``Income_to_Poverty_Ratio`` is binned into three bands
    (0-1.3 'Below Poverty', 1.3-3.5 'Low Income', 3.5-10 'Above Low Income').
    The share of rows per band is printed, followed by row-normalised
    cross-tabulations against ``Covered_by_health_insurance`` and
    ``General_health_condition`` when those columns exist.

    Nothing beyond the header is printed when ``Income_to_Poverty_Ratio`` is
    not a column of ``df``. ``df`` itself is not modified.

    Args:
        df: pandas DataFrame holding the income and outcome columns.

    Returns:
        None. All results are written to stdout.
    """
    print("\n" + "=" * 80)
    print("5. SOCIOECONOMIC BIAS ANALYSIS")
    print("=" * 80)

    if 'Income_to_Poverty_Ratio' not in df.columns:
        return

    # include_lowest=True keeps a ratio of exactly 0 in the first band;
    # without it the left edge is open and those rows become NaN and drop
    # out of every table below.
    income_category = pd.cut(
        df['Income_to_Poverty_Ratio'],
        bins=[0, 1.3, 3.5, 10],
        labels=['Below Poverty', 'Low Income', 'Above Low Income'],
        include_lowest=True,
    ).rename('Income_Category')

    print("\nIncome Category Distribution:")
    print(income_category.value_counts(normalize=True) * 100)

    # (column, heading) pairs for the per-band breakdowns.
    breakdowns = (
        ('Covered_by_health_insurance', "\nHealth Insurance Coverage by Income:"),
        ('General_health_condition', "\nGeneral Health by Income:"),
    )
    for column, heading in breakdowns:
        if column not in df.columns:
            continue
        print(heading)
        # normalize='index' makes every income band's row sum to 100%.
        table = pd.crosstab(income_category, df[column], normalize='index') * 100
        print(table.round(2))
    


# Run the analysis
# generate_bias_report(df)

In [304]:
# Run the four bias reports in order against the loaded frame.
for run_report in (
    analyze_representation_bias,
    analyze_outcome_disparities,
    analyze_continuous_health_metrics,
    analyze_socioeconomic_bias,
):
    run_report(df)

1. REPRESENTATION BIAS ANALYSIS

Gender Distribution:
----------------------------------------
        Count  Percentage
Gender                   
Female   5051       54.59
Male     4202       45.41

Race_Ethnicity Distribution:
----------------------------------------
                                     Count  Percentage
Race_Ethnicity                                        
Non-Hispanic White                    5133       55.47
Non-Hispanic Black                    1199       12.96
Other Race - Including Multi-Racial   1153       12.46
Other Hispanic                        1002       10.83
Mexican American                       766        8.28

Age_Group Distribution:
----------------------------------------
           Count  Percentage
Age_Group                   
61-75       2513       27.16
31-45       1755       18.97
46-60       1637       17.69
0-18        1187       12.83
19-30       1172       12.67
76+          989       10.69

Education_Level Distribution:
----------------

In [305]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import LabelEncoder

# ============================================================================
# SIMPLE BIAS CORRECTION (KEEPS ALL OTHER COLUMNS INTACT)
# ============================================================================


# Step 1: Balance the demographic strata by subsampling
# (the previous message said "Oversampling", but no rows are duplicated here).
print("Step 1: Subsampling over-represented groups...")

# Stratified subsampling:
#   every (Race_Ethnicity, Education_Level) stratum is cut down to the size of
#   the smallest stratum, so majority groups are reduced rather than minority
#   groups being inflated.
strata_cols = ["Race_Ethnicity", "Education_Level"]
target_size = df.groupby(strata_cols).size().min()

# Sample each stratum explicitly rather than through GroupBy.apply: applying a
# function to frames that still contain the grouping columns is deprecated in
# pandas 2.2+. Each stratum is drawn with the same fixed seed as before.
df_balanced = pd.concat(
    [
        stratum.sample(n=target_size, random_state=42)
        for _, stratum in df.groupby(strata_cols)
    ]
).reset_index(drop=True)

# analyze_representation_bias(df_balanced)
# NOTE: df is rebound to the balanced subset from here on.
df = df_balanced
print(df.shape)
df.head()

Step 1: Oversampling underrepresented groups...
(1020, 44)


Unnamed: 0,Had_alcohol_in_the_past,General_hearing_condition,Had_high_blood_pressure,WBC,Haemoglobin,Platelete,Gender,Age,Race_Ethnicity,Country_of_Birth,...,Number_of_hours_of_sleep,Cholestrol_level,SystolicBP,DiastolicBP,Pulse,BODY_MEASURE_COMPOSITE,blood_macros,mean_steroid_ng_dl,functional_difficulty_composite,Age_Group
0,Yes,Good,No,8.0,12.4,278.0,Female,50.0,Mexican American,Unknown/NA,...,5.0,172.0,125.0,77.0,71.0,72.7,45.36225,160.626667,A lot of difficulty,46-60
1,Yes,Good,No,6.8,13.9,264.5,Male,43.0,Mexican American,Unknown/NA,...,7.5,186.0,113.666667,74.333333,72.0,62.64,47.543875,103.591667,Some difficulty,31-45
2,Yes,Good,No,7.6,17.2,231.0,Male,34.0,Mexican American,Born in 50 US states or Washington,...,7.5,186.0,133.333333,94.666667,107.666667,61.32,57.05475,158.71,A lot of difficulty,31-45
3,Yes,Excellent,No,8.0,15.4,251.0,Male,36.0,Mexican American,Born in 50 US states or Washington,...,6.0,185.0,112.333333,69.666667,59.0,60.66,50.26925,88.526667,No difficulty,31-45
4,No,Excellent,No,8.5,14.3,271.0,Female,43.0,Mexican American,Unknown/NA,...,8.0,173.0,118.333333,79.666667,81.333333,66.66,46.36025,54.393333,A lot of difficulty,31-45


## Encoding

In [306]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder


# Nominal categorical → One-Hot
ohe_cols = [
    "Race_Ethnicity",
    "Gender",
    "Country_of_Birth",
    "Marital_Status"
]

# Binary categorical → map to 0/1/2 (Unknown as 2)
binary_cols = [
    "Covered_by_health_insurance",
    "Had_alcohol_in_the_past",
    "Takes_vitamin_supplements",
    "Tested_for_HIV_Virus",
    "Has_diabetes",
    "Had_Asthma",
    "Had_Arthritis",
    "Had_heart_attack",
    "Had_Thyroid",
    "Had_Liver_COndition",
    "Had_Cancer",
    "Has_Hepatitis",
    "Has_Disability"
]

# Ordinal categorical → ordered encoding + Unknown at the end
ordinal_cols = {
    "Age_Group": [["0-18", "19-30", "31-45", "46-60", "61-75", "76+", "Unknown"]],
    "Education_Level": [["Less than 9th grade",
                         "9-11th grade (Includes 12th grade with no diploma)",
                         "High school graduate/GED or equivalent",
                         "Some college or AA degree",
                         "College graduate or above",
                         "Unknown"]],
    "General_health_condition": [["Poor", "Fair", "Good", "Very good", "Excellent", "Unknown"]],
    "General_hearing_condition": [["Deaf", "A lot of trouble", "Moderate hearing trouble",
                                   "A little trouble", "Good", "Excellent", "Unknown"]],
    "Teeth_and_gum_health": [["Poor", "Fair", "Good", "Very good", "Excellent", "Unknown"]]
}

# --------------------------
# Encoding functions
# --------------------------

def encode_dataset(df):
    """
    Return an encoded copy of ``df``; the input frame is left untouched.

    Steps, in order:
      1. Every "Don't know" / "Refused" / "Not Applicable" / "N/A" /
         "Unknown/NA" value is replaced by "Unknown".
      2. Columns listed in ``binary_cols`` are mapped No→0, Yes→1, Unknown→2
         (``Has_diabetes`` additionally maps Borderline→2, i.e. the same code
         as Unknown). Values outside the mapping become NaN.
      3. Columns listed in ``ordinal_cols`` are encoded with ``OrdinalEncoder``
         using the category order given there.
      4. Columns listed in ``ohe_cols`` are expanded with ``pd.get_dummies``
         (all levels kept).

    In every step a listed column that is missing from ``df`` is skipped, so
    the function also works on frames that carry only part of the schema.

    Args:
        df: pandas DataFrame with the raw categorical columns.

    Returns:
        A new pandas DataFrame with the encoded columns.
    """
    df_encoded = df.copy()

    # Normalize all Unknown-like responses into "Unknown"
    df_encoded = df_encoded.replace(
        {"Don't know": "Unknown", "Refused": "Unknown", "Not Applicable": "Unknown", "N/A": "Unknown", "Unknown/NA": "Unknown"}
    )

    # Binary encoding: map Yes/No/Unknown
    yes_no_codes = {"No": 0, "Yes": 1, "Unknown": 2}
    for col in binary_cols:
        # Same guard as the ordinal / one-hot steps: a missing column used to
        # raise KeyError here.
        if col not in df_encoded.columns:
            continue
        codes = dict(yes_no_codes)
        if col == "Has_diabetes":
            codes["Borderline"] = 2
        df_encoded[col] = df_encoded[col].map(codes)

    # Ordinal encoding
    for col, categories in ordinal_cols.items():
        if col in df_encoded.columns:
            encoder = OrdinalEncoder(categories=categories)
            df_encoded[col] = encoder.fit_transform(df_encoded[[col]])

    # One-Hot encoding
    present_ohe_cols = [col for col in ohe_cols if col in df_encoded.columns]
    df_encoded = pd.get_dummies(df_encoded, columns=present_ohe_cols, drop_first=False)

    return df_encoded

# --------------------------
# Usage
# --------------------------

# Encode the balanced frame and compare shapes before/after.
df_encoded = encode_dataset(df)

for label, frame in (("Original", df), ("Encoded", df_encoded)):
    print(f"{label} shape:", frame.shape)

df_encoded.head()


Original shape: (1020, 44)
Encoded shape: (1020, 53)


Unnamed: 0,Had_alcohol_in_the_past,General_hearing_condition,Had_high_blood_pressure,WBC,Haemoglobin,Platelete,Age,Education_Level,Household_Size,Income_to_Poverty_Ratio,...,Race_Ethnicity_Other Hispanic,Race_Ethnicity_Other Race - Including Multi-Racial,Gender_Female,Gender_Male,Country_of_Birth_Born in 50 US states or Washington,Country_of_Birth_Unknown,Marital_Status_Married/Living with partner,Marital_Status_Never married,Marital_Status_Unknown,Marital_Status_Widowed/Divorced/Separated
0,1,4.0,No,8.0,12.4,278.0,50.0,1.0,5.0,1.14,...,False,False,True,False,False,True,True,False,False,False
1,1,4.0,No,6.8,13.9,264.5,43.0,1.0,2.0,0.63,...,False,False,False,True,False,True,False,True,False,False
2,1,4.0,No,7.6,17.2,231.0,34.0,1.0,3.0,2.74,...,False,False,False,True,True,False,False,True,False,False
3,1,5.0,No,8.0,15.4,251.0,36.0,1.0,4.0,4.22,...,False,False,False,True,True,False,True,False,False,False
4,0,5.0,No,8.5,14.3,271.0,43.0,1.0,5.0,1.23,...,False,False,True,False,False,True,True,False,False,False


In [307]:
# Verify that the encoding was successful

# Binary columns: expect only the codes 0 / 1 / 2
for col in binary_cols:
    if col in df_encoded.columns:
        print(col, df_encoded[col].unique())

# Ordinal columns: expect the positions defined in ordinal_cols
for col in ordinal_cols.keys():
    if col in df_encoded.columns:
        print(col, df_encoded[col].unique())

# One-hot columns: list the generated dummy columns. This used to be a bare
# expression in the middle of the cell, so its value was never shown.
ohe_generated = [
    col for col in df_encoded.columns
    if any(col.startswith(f"{base}_") for base in ohe_cols)
]
print("One-hot encoded columns:", ohe_generated)

print("Original shape:", df.shape)
print("Encoded shape:", df_encoded.shape)

# From here on df refers to the encoded frame (same object as df_encoded).
df = df_encoded

Covered_by_health_insurance [1 0 2]
Had_alcohol_in_the_past [1 0 2]
Takes_vitamin_supplements [0 1]
Tested_for_HIV_Virus [0 1 2]
Has_diabetes [0 2 1]
Had_Asthma [0 1]
Had_Arthritis [0 1 2]
Had_heart_attack [0 2 1]
Had_Thyroid [0 1 2]
Had_Liver_COndition [0 1 2]
Had_Cancer [0 1 2]
Has_Hepatitis [0 1 2]
Has_Disability [0 1 2]
Age_Group [3. 2. 4. 1. 5. 0.]
Education_Level [1. 4. 2. 0. 3. 5.]
General_health_condition [2. 4. 3. 1. 0. 5.]
General_hearing_condition [4. 5. 3. 1. 2. 0.]
Teeth_and_gum_health [0. 1. 3. 2. 4. 5.]
Original shape: (1020, 44)
Encoded shape: (1020, 53)


In [308]:

# Skewness of every int/float column, most right-skewed first.
# The selection also picks up the integer-coded categorical columns.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
skewness_by_column = df[numerical_cols].skew()

print("\n=== Skewness of Numerical Columns ===")
print(skewness_by_column.sort_values(ascending=False))



=== Skewness of Numerical Columns ===
Number_of_Moderate_Physical_activities_per_week    18.713517
Number_of_Vigorous_Physical_activities_per_week    13.071211
Takes_vitamin_supplements                           8.371207
blood_macros                                        8.117693
mean_steroid_ng_dl                                  7.658309
Has_Hepatitis                                       7.506328
Has_diabetes                                        2.422068
Had_heart_attack                                    1.654584
Had_Liver_COndition                                 1.556247
Had_Asthma                                          1.502210
Had_Cancer                                          1.426541
Had_Thyroid                                         1.424948
Tested_for_HIV_Virus                                1.137184
WBC                                                 1.133834
SystolicBP                                          1.050028
Has_Disability                                

In [309]:
# Columns to log-transform.
# Most of the other columns with skewness above 1 are coded categorical columns.
skewed_cols = [
    "Number_of_Moderate_Physical_activities_per_week",
    "Number_of_Vigorous_Physical_activities_per_week",
    "blood_macros",
    "mean_steroid_ng_dl",
    "WBC"
]

# Work on a fresh copy of the encoded frame. Previously df was the very same
# object as df_encoded (df = df_encoded), so this cell silently modified
# df_encoded and applied log1p a second time whenever it was re-run.
df = df_encoded.copy()

# Apply log1p safely: negatives are clipped to 0 first, and log1p(0) == 0.
for col in skewed_cols:
    if col in df.columns:
        df[col] = np.log1p(df[col].clip(lower=0))

## Scaling

In [310]:
import matplotlib.pyplot as plt
import seaborn as sns

# Candidate columns: every int/float column in the frame
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Integer-coded categoricals (binary / ordinal) are not continuous
# measurements, so they are left out of the outlier check.
encoded_categoricals = [
    "Had_alcohol_in_the_past", "Covered_by_health_insurance",
    "Takes_vitamin_supplements", "Tested_for_HIV_Virus", "Has_diabetes",
    "Had_Asthma", "Had_Arthritis", "Had_heart_attack", "Had_Thyroid",
    "Had_Liver_COndition", "Had_Cancer", "Has_Hepatitis", "Has_Disability",
    "Education_Level", "Age_Group", "General_health_condition",
    "General_hearing_condition", "Teeth_and_gum_health"
]

continuous_cols = [col for col in numerical_cols if col not in encoded_categoricals]

# Z-score method: per column, count the non-missing values with |z| > 3
from scipy.stats import zscore
outlier_report = {
    col: (abs(zscore(df[col].dropna())) > 3).sum()
    for col in continuous_cols
}

print("Outliers detected (Z-score > 3):")
for col, count in outlier_report.items():
    print(f"{col}: {count}")



Outliers detected (Z-score > 3):
WBC: 13
Haemoglobin: 10
Platelete: 14
Age: 0
Household_Size: 0
Income_to_Poverty_Ratio: 0
HDL_mg: 12
Family_poverty_level_index: 0
Number_of_Moderate_Physical_activities_per_week: 4
Number_of_Vigorous_Physical_activities_per_week: 8
Number_of_hours_of_sleep: 17
Cholestrol_level: 7
SystolicBP: 14
DiastolicBP: 8
Pulse: 7
BODY_MEASURE_COMPOSITE: 13
blood_macros: 11
mean_steroid_ng_dl: 6


In [311]:
#StandardScaler assumes normal-like distribution. Outliers will still pull the mean/std strongly.

# RobustScaler uses median and IQR → much better when outliers exist.


from sklearn.preprocessing import StandardScaler, RobustScaler

# Columns with many z-score outliers → RobustScaler
robust_cols = [
    "WBC", "Haemoglobin", "Platelete", "HDL_mg",
    "Number_of_hours_of_sleep",
    "SystolicBP", "DiastolicBP", "Pulse",
    "BODY_MEASURE_COMPOSITE",
    "blood_macros", "mean_steroid_ng_dl"
]

# Columns with few or no outliers → StandardScaler
standard_cols = [
    "Age", "Household_Size", "Income_to_Poverty_Ratio",
    "Cholestrol_level", "Family_poverty_level_index"
]

scaled_cols = robust_cols + standard_cols

# Scale a copy so the unscaled frame is not modified by the assignments below.
df_scaled = df.copy()

print("Before Scaling:")
print(df_scaled[scaled_cols].describe().T)

# Fit each scaler on its own column group and write the result back in place
# (robust group first, then the standard group).
robust_scaler = RobustScaler()
standard_scaler = StandardScaler()
for scaler, cols in ((robust_scaler, robust_cols), (standard_scaler, standard_cols)):
    df_scaled[cols] = scaler.fit_transform(df_scaled[cols])

print("✅ Scaling complete")
print("RobustScaler applied to:", robust_cols)
print("StandardScaler applied to:", standard_cols)

print("After Scaling:")
print(df_scaled[scaled_cols].describe().T)

# df now refers to the scaled frame (same object as df_scaled).
df = df_scaled


Before Scaling:
                             count        mean        std           min  \
WBC                         1020.0    2.025529   0.215873  1.280934e+00   
Haemoglobin                 1020.0   13.639706   1.354186  6.300000e+00   
Platelete                   1020.0  259.366176  64.137250  6.100000e+01   
HDL_mg                      1020.0   52.580392  11.014684  2.300000e+01   
Number_of_hours_of_sleep    1020.0    7.782843   1.715870  2.000000e+00   
SystolicBP                  1020.0  119.701634  16.436586  7.966667e+01   
DiastolicBP                 1020.0   72.272222  10.358774  3.400000e+01   
Pulse                       1020.0   72.542810  10.510675  3.900000e+01   
BODY_MEASURE_COMPOSITE      1020.0   60.092462   9.487420  2.715000e+01   
blood_macros                1020.0    3.867311   0.122732  3.446274e+00   
mean_steroid_ng_dl          1020.0    4.142690   0.829150  8.707981e-01   
Age                         1020.0   44.938235  22.472731  6.000000e+00   
Household

In [None]:
# Final pass: standardize every int/float column of df_scaled with a single
# StandardScaler so that PCA and clustering see all features on one scale.
# df_scaled is modified in place.
# NOTE(review): the selection also covers the integer-coded binary/ordinal
# columns, and z-scoring the columns that were robust-scaled earlier gives the
# same values as z-scoring them directly — confirm both effects are intended.
all_numeric = df_scaled.select_dtypes(include=['int64', 'float64']).columns

final_scaler = StandardScaler().fit(df_scaled[all_numeric])
df_scaled[all_numeric] = final_scaler.transform(df_scaled[all_numeric])

print("Final unified scaling applied to all numeric features")
print(df_scaled[all_numeric].describe().T.head())

✅ Final unified scaling applied to all numeric features
                            count          mean       std       min       25%  \
Had_alcohol_in_the_past    1020.0  2.507798e-16  1.000491 -2.248894 -0.186060   
General_hearing_condition  1020.0  1.097162e-16  1.000491 -3.765821 -0.039275   
WBC                        1020.0 -1.741526e-18  1.000491 -3.450914 -0.503348   
Haemoglobin                1020.0  2.438137e-17  1.000491 -5.422673 -0.546505   
Platelete                  1020.0 -3.483053e-18  1.000491 -3.094356 -0.567284   

                                50%       75%       max  
Had_alcohol_in_the_past   -0.186060 -0.186060  1.876775  
General_hearing_condition -0.039275  0.892362  0.892362  
WBC                        0.012141  0.530838  4.135203  
Haemoglobin                0.192309  0.413953  3.295325  
Platelete                 -0.083708  0.372569  5.298020  


In [313]:
df.head()

Unnamed: 0,Had_alcohol_in_the_past,General_hearing_condition,Had_high_blood_pressure,WBC,Haemoglobin,Platelete,Age,Education_Level,Household_Size,Income_to_Poverty_Ratio,...,Race_Ethnicity_Other Hispanic,Race_Ethnicity_Other Race - Including Multi-Racial,Gender_Female,Gender_Male,Country_of_Birth_Born in 50 US states or Washington,Country_of_Birth_Unknown,Marital_Status_Married/Living with partner,Marital_Status_Never married,Marital_Status_Unknown,Marital_Status_Widowed/Divorced/Separated
0,-0.18606,-0.039275,No,0.795746,-0.915911,0.290673,0.225351,-0.87831,1.060371,-0.835265,...,False,False,True,False,False,True,True,False,False,False
1,-0.18606,-0.039275,No,0.132528,0.192309,0.080084,-0.086291,-0.87831,-0.681301,-1.18325,...,False,False,False,True,False,True,False,True,False,False
2,-0.18606,-0.039275,No,0.585045,2.630393,-0.44249,-0.486972,-0.87831,-0.100744,0.256453,...,False,False,False,True,True,False,False,True,False,False
3,-0.18606,0.892362,No,0.795746,1.300529,-0.130506,-0.397932,-0.87831,0.479814,1.266293,...,False,False,False,True,True,False,True,False,False,False
4,-2.248894,0.892362,No,1.046327,0.487834,0.181478,-0.086291,-0.87831,1.060371,-0.773856,...,False,False,True,False,False,True,True,False,False,False


In [314]:
# Cast the boolean one-hot indicator columns to 0/1 integers.
bool_cols = df.select_dtypes(include=['bool']).columns
df[bool_cols] = df[bool_cols].astype(int)

# Had_high_blood_pressure still holds text answers at this point; give it the
# same No/Yes/Unknown codes as the other yes/no columns.
blood_pressure_codes = {
    "No": 0,
    "Yes": 1,
    "Unknown": 2
}
df["Had_high_blood_pressure"] = df["Had_high_blood_pressure"].replace(blood_pressure_codes)

df.head()

Unnamed: 0,Had_alcohol_in_the_past,General_hearing_condition,Had_high_blood_pressure,WBC,Haemoglobin,Platelete,Age,Education_Level,Household_Size,Income_to_Poverty_Ratio,...,Race_Ethnicity_Other Hispanic,Race_Ethnicity_Other Race - Including Multi-Racial,Gender_Female,Gender_Male,Country_of_Birth_Born in 50 US states or Washington,Country_of_Birth_Unknown,Marital_Status_Married/Living with partner,Marital_Status_Never married,Marital_Status_Unknown,Marital_Status_Widowed/Divorced/Separated
0,-0.18606,-0.039275,0,0.795746,-0.915911,0.290673,0.225351,-0.87831,1.060371,-0.835265,...,0,0,1,0,0,1,1,0,0,0
1,-0.18606,-0.039275,0,0.132528,0.192309,0.080084,-0.086291,-0.87831,-0.681301,-1.18325,...,0,0,0,1,0,1,0,1,0,0
2,-0.18606,-0.039275,0,0.585045,2.630393,-0.44249,-0.486972,-0.87831,-0.100744,0.256453,...,0,0,0,1,1,0,0,1,0,0
3,-0.18606,0.892362,0,0.795746,1.300529,-0.130506,-0.397932,-0.87831,0.479814,1.266293,...,0,0,0,1,1,0,1,0,0,0
4,-2.248894,0.892362,0,1.046327,0.487834,0.181478,-0.086291,-0.87831,1.060371,-0.773856,...,0,0,1,0,0,1,1,0,0,0


In [315]:





# target_size = int(len(df_balanced) * 0.08)  # 8% of dataset

# # Oversample by Race
# race_dfs = []
# for race in df_balanced['Race_Ethnicity'].unique():
#     race_df = df_balanced[df_balanced['Race_Ethnicity'] == race].copy()
#     if len(race_df) < target_size:
#         race_df = race_df.sample(n=target_size, replace=True, random_state=42)
#     race_dfs.append(race_df)

# df_balanced = pd.concat(race_dfs, ignore_index=True)

# # Oversample by Education
# edu_dfs = []
# for edu in df_balanced['Education_Level'].unique():
#     edu_df = df_balanced[df_balanced['Education_Level'] == edu].copy()
#     if edu == 'Unknown/NA':
#         edu_dfs.append(edu_df)
#         continue
#     if len(edu_df) < target_size:
#         edu_df = edu_df.sample(n=target_size, replace=True, random_state=42)
#     edu_dfs.append(edu_df)

# df_balanced = pd.concat(edu_dfs, ignore_index=True).reset_index(drop=True)

# print(f"Dataset size: {len(df)} -> {len(df_balanced)}")

# # Step 2: Encode demographics for regression
# print("\nStep 2: Encoding demographics...")

# le_race = LabelEncoder()
# le_gender = LabelEncoder()
# le_age = LabelEncoder()

# df_balanced['Race_encoded'] = le_race.fit_transform(df_balanced['Race_Ethnicity'].astype(str))
# df_balanced['Gender_encoded'] = le_gender.fit_transform(df_balanced['Gender'].astype(str))
# df_balanced['Age_encoded'] = le_age.fit_transform(df_balanced['Age_Group'].astype(str))

# X_demographics = df_balanced[['Race_encoded', 'Gender_encoded', 'Age_encoded']]

# # Step 3: Correct continuous variables using regression
# print("\nStep 3: Correcting continuous variables...")

# continuous_vars = ['SystolicBP', 'DiastolicBP', 'Cholestrol_level', 
#                    'HDL_mg', 'WBC', 'Haemoglobin']

# for var in continuous_vars:
#     if var not in df_balanced.columns:
#         print(f"  ⚠️  {var} not found in dataset, skipping...")
#         continue
    
#     # Get non-missing data
#     mask = df_balanced[var].notna()
#     if mask.sum() < 10:
#         print(f"  ⚠️  {var} has too few valid values, skipping...")
#         continue
    
#     X = X_demographics[mask]
#     y = df_balanced.loc[mask, var].astype(float)
    
#     # Fit regression
#     model = LinearRegression()
#     model.fit(X, y)
    
#     # Get residuals
#     predictions = model.predict(X)
#     residuals = y - predictions
    
#     # Overwrite ONLY this column with residuals
#     df_balanced.loc[mask, var] = residuals
    
#     print(f"  ✓ {var} corrected in place (R² = {model.score(X, y):.3f})")

# # Step 4: Convert and correct binary variables
# print("\nStep 4: Converting and correcting binary outcomes...")

# binary_vars = ['Has_diabetes', 'Had_heart_attack', 'Had_Cancer', 'Had_Asthma']

# for var in binary_vars:
#     if var not in df_balanced.columns:
#         print(f"  ⚠️  {var} not found in dataset, skipping...")
#         continue
    
#     print(f"\n  Processing {var}...")
#     print(f"    Original unique values: {df_balanced[var].unique()}")
    
#     # FORCE convert to numeric - handle ALL possible values
#     value_map = {}
#     for val in df_balanced[var].unique():
#         if pd.isna(val):
#             continue
#         val_str = str(val).strip().lower()
#         if val_str == 'yes':
#             value_map[val] = 1.0
#         else:  # No, Borderline, Refused, Don't know, etc. all become 0
#             value_map[val] = 0.0
    
#     # Apply mapping
#     df_balanced[var] = df_balanced[var].map(value_map)
#     print(f"    Converted to numeric: {df_balanced[var].unique()}")
    
#     # Get non-missing data
#     mask = df_balanced[var].notna()
#     if mask.sum() < 10:
#         print(f"    ⚠️  Too few valid values, skipping...")
#         continue
    
#     X = X_demographics[mask]
#     y = df_balanced.loc[mask, var]
    
#     # Skip if only one class
#     if y.nunique() < 2:
#         print(f"    ⚠️  Only one class present, skipping...")
#         continue
    
#     print(f"    Class distribution: 0={(y==0).sum()}, 1={(y==1).sum()}")
    
#     # Fit logistic regression
#     model = LogisticRegression(max_iter=1000, random_state=42)
#     model.fit(X, y.astype(int))
    
#     # Get residuals
#     pred_probs = model.predict_proba(X)[:, 1]
#     residuals = y - pred_probs
    
#     # Overwrite ONLY this column with residuals
#     df_balanced.loc[mask, var] = residuals
    
#     print(f"    ✓ {var} corrected in place")

# # Step 5: Handle General_health_condition encoding if needed
# print("\nStep 5: Encoding General_health_condition...")

# if 'General_health_condition' in df_balanced.columns:
#     if df_balanced['General_health_condition'].dtype == 'object':
#         print(f"  General_health_condition unique values: {df_balanced['General_health_condition'].unique()}")
#         health_map = {'Excellent': 0, 'Very good': 1, 'Good': 2, 
#                       'Fair': 3, 'Poor': 4, "Don't know": 2}
#         df_balanced['General_health_condition'] = df_balanced['General_health_condition'].map(health_map)

# print("\n" + "="*80)
# print("BIAS CORRECTION COMPLETE")
# print("="*80)
# print(f"Total columns in df_balanced: {len(df_balanced.columns)}")
# print(f"Modified columns: {continuous_vars + binary_vars + ['General_health_condition']}")
# print(f"All other columns remain unchanged")
# print("="*80)

# # This is your final dataframe with all columns
# df_final = df_balanced

In [316]:
# df = df_final.copy()

In [317]:
# from scipy import stats

# print("="*80)
# print("BIAS VERIFICATION AFTER CORRECTION")
# print("="*80)

# # Check continuous variables - means should be near 0 for all groups
# continuous_vars = ['SystolicBP', 'DiastolicBP', 'Cholestrol_level', 
#                    'HDL_mg', 'WBC', 'Haemoglobin']

# print("\nCONTINUOUS VARIABLES - Mean by Race (should all be ~0):")
# print("-"*80)
# for var in continuous_vars:
#     print(f"\n{var}:")
#     race_means = df_final.groupby('Race_Ethnicity')[var].mean()
#     print(race_means)
    
#     # ANOVA test
#     groups = [df_final[df_final['Race_Ethnicity']==race][var].dropna() 
#               for race in df_final['Race_Ethnicity'].unique()]
#     f_stat, p_val = stats.f_oneway(*groups)
    
#     if p_val > 0.05:
#         print(f"✓ No significant bias (p={p_val:.4f})")
#     else:
#         print(f"⚠️ Some bias remains (p={p_val:.4f})")

# # Check binary variables - means should be near 0 for all groups
# binary_vars = ['Has_diabetes', 'Had_heart_attack', 'Had_Cancer', 'Had_Asthma']

# print("\n" + "="*80)
# print("BINARY VARIABLES (RESIDUALS) - Mean by Race (should all be ~0):")
# print("-"*80)
# for var in binary_vars:
#     print(f"\n{var}:")
#     race_means = df_final.groupby('Race_Ethnicity')[var].mean()
#     print(race_means)
    
#     # ANOVA test
#     groups = [df_final[df_final['Race_Ethnicity']==race][var].dropna() 
#               for race in df_final['Race_Ethnicity'].unique()]
#     f_stat, p_val = stats.f_oneway(*groups)
    
#     if p_val > 0.05:
#         print(f"✓ No significant bias (p={p_val:.4f})")
#     else:
#         print(f"⚠️ Some bias remains (p={p_val:.4f})")