### Bias Detection and Handling

In [1]:
# import pandas as pd
# df = pd.read_pickle('Data/df_cleaned.pkl')

import sys
import numpy as np
import types
import pandas as pd

df = pd.read_csv('Data/df_cleaned_v2.csv', index_col=0)


In [2]:
df.shape

(9757, 47)

In [3]:
df.drop(columns='zero_count',inplace=True)

In [4]:
df.isna().sum()

General_hearing_condition                          0
Had_high_blood_pressure                            0
WBC                                                0
Haemoglobin                                        0
Platelete                                          0
Gender                                             0
Age                                                0
Race_Ethnicity                                     0
Country_of_Birth                                   0
Education_Level                                    0
Marital_Status                                     0
Household_Size                                     0
Income_to_Poverty_Ratio                            0
Has_diabetes                                       0
Takes_vitamin_supplements                          0
Has_Disability                                     0
HDL_mg                                             0
Has_Hepatitis                                      0
Covered_by_health_insurance                   

Check 1: Demographic Representation (Visualization)
Purpose: Ensure no group is severely underrepresented

In [5]:
df['functional_difficulty_composite'].unique()

array(['No difficulty', 'Some difficulty', 'A lot of difficulty',
       'Very Severe Difficulty'], dtype=object)

In [6]:
df.columns

Index(['General_hearing_condition', 'Had_high_blood_pressure', 'WBC',
       'Haemoglobin', 'Platelete', 'Gender', 'Age', 'Race_Ethnicity',
       'Country_of_Birth', 'Education_Level', 'Marital_Status',
       'Household_Size', 'Income_to_Poverty_Ratio', 'Has_diabetes',
       'Takes_vitamin_supplements', 'Has_Disability', 'HDL_mg',
       'Has_Hepatitis', 'Covered_by_health_insurance', 'Tested_for_HIV_Virus',
       'General_health_condition', 'Received_Hepatitis_A_Vaccine',
       'Family_poverty_level_index', 'Has_Kidney_Failure', 'Had_Asthma',
       'Had_Arthritis', 'Had_heart_attack', 'Had_Thyroid',
       'Had_Liver_COndition', 'Had_Cancer', 'Teeth_and_gum_health',
       'Number_of_Moderate_Physical_activities_per_week',
       'Number_of_Vigorous_Physical_activities_per_week',
       'Number_of_hours_of_sleep', 'Cholestrol_level', 'SystolicBP',
       'DiastolicBP', 'Pulse', 'BODY_MEASURE_COMPOSITE', 'blood_macros',
       'mean_steroid_ng_dl', 'balance_symptom_score', 'balan

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Assuming your dataframe is named 'df'
# df = pd.read_csv('your_nhanes_data.csv')

def analyze_representation_bias(df):
    """
    Analyze demographic representation to identify potential sampling bias
    """
    print("=" * 80)
    print("1. REPRESENTATION BIAS ANALYSIS")
    print("=" * 80)
    
    demographic_cols = ['Gender', 'Race_Ethnicity', 'Age_Group', 'Education_Level', 
                        'Marital_Status', 'Country_of_Birth']
    
    for col in demographic_cols:
        if col in df.columns:
            print(f"\n{col} Distribution:")
            print("-" * 40)
            counts = df[col].value_counts(dropna=False)
            percentages = df[col].value_counts(normalize=True, dropna=False) * 100
            
            result_df = pd.DataFrame({
                'Count': counts,
                'Percentage': percentages.round(2)
            })
            print(result_df)
            
            # Check for severe underrepresentation (< 5%)
            underrepresented = percentages[percentages < 5]
            if len(underrepresented) > 0:
                print(f"\n⚠️  WARNING: Underrepresented groups (< 5%):")
                for group, pct in underrepresented.items():
                    print(f"   - {group}: {pct:.2f}%")


def analyze_outcome_disparities(df):
    """
    Analyze health outcome disparities across demographic groups
    """
    print("\n" + "=" * 80)
    print("3. HEALTH OUTCOME DISPARITY ANALYSIS")
    print("=" * 80)
    
    # Conditions to analyze
    conditions = ['Has_diabetes', 'Has_high_blood_pressure', 'Had_heart_attack', 
                  'Had_Cancer', 'Had_Asthma', 'General_health_condition']
    
    demographic_cols = ['Gender', 'Race_Ethnicity', 'Age_Group']
    
    for condition in conditions:
        if condition not in df.columns:
            continue
            
        print(f"\n{condition}:")
        print("-" * 40)
        
        for demo_col in demographic_cols:
            if demo_col not in df.columns:
                continue
                
            if condition == 'General_health_condition':
                # For ordinal health condition, calculate mean rating
                health_order = ['Excellent', 'Very good', 'Good', 'Fair', 'Poor']
                temp_df = df[df[condition].isin(health_order)].copy()
                temp_df['health_score'] = temp_df[condition].map(
                    {h: i for i, h in enumerate(health_order)}
                )
                by_group = temp_df.groupby(demo_col)['health_score'].mean()
                print(f"\n  {demo_col} (lower score = better health):")
                print(f"  {by_group.round(2).to_dict()}")
            else:
                # For binary conditions
                by_group = df.groupby(demo_col)[condition].apply(
                    lambda x: (x == 'Yes').sum() / x.notna().sum() * 100
                )
                print(f"\n  {demo_col} (% with condition):")
                print(f"  {by_group.round(2).to_dict()}")


def analyze_continuous_health_metrics(df):
    """
    Analyze continuous health metrics for disparities
    """
    print("\n" + "=" * 80)
    print("4. CONTINUOUS HEALTH METRICS ANALYSIS")
    print("=" * 80)
    
    metrics = ['SystolicBP', 'DiastolicBP', 'Cholestrol_level', 
               'HDL_mg', 'WBC', 'Haemoglobin']
    demographic_cols = ['Gender', 'Race_Ethnicity']
    
    for metric in metrics:
        if metric not in df.columns:
            continue
            
        print(f"\n{metric}:")
        print("-" * 40)
        
        for demo_col in demographic_cols:
            if demo_col not in df.columns:
                continue
                
            print(f"\n  By {demo_col}:")
            by_group = df.groupby(demo_col)[metric].agg(['mean', 'std', 'count'])
            print(f"  {by_group.round(2)}")
            
            # Perform statistical test (ANOVA)
            groups = [group[metric].dropna() for name, group in df.groupby(demo_col)]
            if len(groups) > 1 and all(len(g) > 0 for g in groups):
                f_stat, p_value = stats.f_oneway(*groups)
                if p_value < 0.05:
                    print(f"  ⚠️  Significant difference detected (p={p_value:.4f})")


def analyze_socioeconomic_bias(df):
    """
    Analyze potential socioeconomic biases
    """
    print("\n" + "=" * 80)
    print("5. SOCIOECONOMIC BIAS ANALYSIS")
    print("=" * 80)
    
    if 'Income_to_Poverty_Ratio' in df.columns:
        # Create income categories
        df_temp = df.copy()
        df_temp['Income_Category'] = pd.cut(
            df_temp['Income_to_Poverty_Ratio'],
            bins=[0, 1.3, 3.5, 10],
            labels=['Below Poverty', 'Low Income', 'Above Low Income']
        )
        
        print("\nIncome Category Distribution:")
        print(df_temp['Income_Category'].value_counts(normalize=True) * 100)
        
        # Health insurance coverage by income
        if 'Covered_by_health_insurance' in df.columns:
            print("\nHealth Insurance Coverage by Income:")
            coverage = pd.crosstab(
                df_temp['Income_Category'],
                df_temp['Covered_by_health_insurance'],
                normalize='index'
            ) * 100
            print(coverage.round(2))
        
        # Health outcomes by income
        if 'General_health_condition' in df.columns:
            print("\nGeneral Health by Income:")
            health = pd.crosstab(
                df_temp['Income_Category'],
                df_temp['General_health_condition'],
                normalize='index'
            ) * 100
            print(health.round(2))
    


# Run the analysis
# generate_bias_report(df)

In [8]:
analyze_representation_bias(df)
analyze_outcome_disparities(df)
analyze_continuous_health_metrics(df)
analyze_socioeconomic_bias(df)

1. REPRESENTATION BIAS ANALYSIS

Gender Distribution:
----------------------------------------
        Count  Percentage
Gender                   
Female   5301       54.33
Male     4456       45.67

Race_Ethnicity Distribution:
----------------------------------------
                                     Count  Percentage
Race_Ethnicity                                        
Non-Hispanic White                    5321       54.54
Non-Hispanic Black                    1277       13.09
Other Race - Including Multi-Racial   1239       12.70
Other Hispanic                        1079       11.06
Mexican American                       841        8.62

Age_Group Distribution:
----------------------------------------
           Count  Percentage
Age_Group                   
61-75       2494       25.56
31-45       1749       17.93
0-18        1680       17.22
46-60       1627       16.68
19-30       1235       12.66
76+          972        9.96

Education_Level Distribution:
----------------

In [9]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import LabelEncoder

# ============================================================================
# TARGETED OVERSAMPLING FOR UNDERREPRESENTED GROUPS
# ============================================================================

print("Step 1: Targeted oversampling for underrepresented groups...")

# Define target percentages for underrepresented groups
target_percentages = {
    'Race_Ethnicity': {
        'Mexican American': 0.10  # Target 10%
    },
    'Education_Level': {
        'Less than 9th grade': 0.10  # Target 10%
    }
}

df_oversampled = df.copy()

# Oversample Race_Ethnicity groups
for group, target_pct in target_percentages['Race_Ethnicity'].items():
    current_count = len(df[df['Race_Ethnicity'] == group])
    current_pct = current_count / len(df)
    
    if current_pct < target_pct:
        # Calculate how many additional samples needed
        target_count = int(len(df) * target_pct)
        additional_samples = target_count - current_count
        
        # Oversample with replacement
        group_data = df[df['Race_Ethnicity'] == group]
        oversampled = group_data.sample(n=additional_samples, replace=True, random_state=42)
        df_oversampled = pd.concat([df_oversampled, oversampled], ignore_index=True)
        
        print(f"Oversampled '{group}': {current_count} → {target_count} ({current_pct:.2%} → {target_pct:.2%})")

# Oversample Education_Level groups
for group, target_pct in target_percentages['Education_Level'].items():
    current_count = len(df_oversampled[df_oversampled['Education_Level'] == group])
    current_pct = current_count / len(df_oversampled)
    
    if current_pct < target_pct:
        # Calculate based on the new size after previous oversampling
        target_count = int(len(df_oversampled) * target_pct)
        additional_samples = target_count - current_count
        
        # Oversample with replacement
        group_data = df_oversampled[df_oversampled['Education_Level'] == group]
        oversampled = group_data.sample(n=additional_samples, replace=True, random_state=42)
        df_oversampled = pd.concat([df_oversampled, oversampled], ignore_index=True)
        
        print(f"Oversampled '{group}': {current_count} → {target_count} ({current_pct:.2%} → {target_pct:.2%})")

print(f"\nOriginal df shape: {df.shape}")
print(f"Oversampled df shape: {df_oversampled.shape}")

# Verify new distributions
print("\n" + "="*50)
print("NEW DISTRIBUTIONS AFTER OVERSAMPLING")
print("="*50)
print("\nRace_Ethnicity Distribution:")
print(df_oversampled['Race_Ethnicity'].value_counts())
print("\nEducation_Level Distribution:")
print(df_oversampled['Education_Level'].value_counts())

df_oversampled.head()

Step 1: Targeted oversampling for underrepresented groups...
Oversampled 'Mexican American': 841 → 975 (8.62% → 10.00%)
Oversampled 'Less than 9th grade': 388 → 989 (3.92% → 10.00%)

Original df shape: (9757, 46)
Oversampled df shape: (10492, 46)

NEW DISTRIBUTIONS AFTER OVERSAMPLING

Race_Ethnicity Distribution:
Race_Ethnicity
Non-Hispanic White                     5440
Non-Hispanic Black                     1333
Other Race - Including Multi-Racial    1295
Other Hispanic                         1254
Mexican American                       1170
Name: count, dtype: int64

Education_Level Distribution:
Education_Level
Some college or AA degree                             2725
College graduate or above                             2635
High school graduate/GED or equivalent                1744
Unknown/NA                                            1732
Less than 9th grade                                    989
9-11th grade (Includes 12th grade with no diploma)     667
Name: count, dtype: int

Unnamed: 0,General_hearing_condition,Had_high_blood_pressure,WBC,Haemoglobin,Platelete,Gender,Age,Race_Ethnicity,Country_of_Birth,Education_Level,...,DiastolicBP,Pulse,BODY_MEASURE_COMPOSITE,blood_macros,mean_steroid_ng_dl,balance_symptom_score,balance_impact_score,fall_risk_score,functional_difficulty_composite,Age_Group
0,Excellent,Yes,4.7,15.7,259.0,Male,43.0,Other Race - Including Multi-Racial,Unknown/NA,College graduate or above,...,96.0,81.0,0.614546,0.047064,0.351579,0,0.0,0.0,No difficulty,31-45
1,Moderate hearing trouble,Yes,6.3,15.2,221.0,Male,66.0,Non-Hispanic White,Born in 50 US states or Washington,College graduate or above,...,78.666667,72.0,0.508978,1.166786,0.065015,0,0.0,0.0,Some difficulty,61-75
2,Moderate hearing trouble,No,5.7,13.8,235.0,Female,44.0,Other Hispanic,Unknown/NA,High school graduate/GED or equivalent,...,78.333333,81.333333,0.087322,-0.12156,0.070373,6,1.0,3.5,Some difficulty,31-45
3,Good,No,6.8,13.9,264.5,Male,43.0,Mexican American,Unknown/NA,9-11th grade (Includes 12th grade with no dipl...,...,74.333333,72.0,0.318792,-0.04917,0.216401,0,0.0,0.0,No difficulty,31-45
4,Good,No,6.5,14.0,241.0,Female,65.0,Non-Hispanic White,Born in 50 US states or Washington,High school graduate/GED or equivalent,...,74.0,69.333333,0.249125,-0.012353,-0.27409,0,0.0,0.0,No difficulty,61-75


In [10]:
df = df_oversampled.copy()
df.to_csv('Data/sampled_dataset.csv')

## Encoding

In [11]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder


# Nominal categorical → One-Hot
ohe_cols = [
    "Race_Ethnicity",
    "Gender",
    "Country_of_Birth",
    "Marital_Status"
]

# Binary categorical → map to 0/1/2 (Unknown as 2)
binary_cols = [
    "Covered_by_health_insurance",
    # "Had_alcohol_in_the_past",
    "Has_Kidney_Failure",
    "Had_high_blood_pressure",
    "Takes_vitamin_supplements",
    "Tested_for_HIV_Virus",
    "Has_diabetes",
    "Had_Asthma",
    "Had_Arthritis",
    "Had_heart_attack",
    "Had_Thyroid",
    "Had_Liver_COndition",
    "Had_Cancer",
    "Has_Hepatitis",
    "Has_Disability",
    "Received_Hepatitis_A_Vaccine"
]

# Ordinal categorical → ordered encoding + Unknown at the end
ordinal_cols = {
    "Age_Group": [["0-18", "19-30", "31-45", "46-60", "61-75", "76+", "Unknown"]],
    "Education_Level": [["Less than 9th grade",
                         "9-11th grade (Includes 12th grade with no diploma)",
                         "High school graduate/GED or equivalent",
                         "Some college or AA degree",
                         "College graduate or above",
                         "Unknown"]],
    "General_health_condition": [["Poor", "Fair", "Good", "Very good", "Excellent", "Unknown"]],
    "General_hearing_condition": [["Deaf", "A lot of trouble", "Moderate hearing trouble",
                                   "A little trouble", "Good", "Excellent", "Unknown"]],
    "functional_difficulty_composite": [['Very Severe Difficulty', 'A lot of difficulty', 'No difficulty', 'Some difficulty','Unknown']],
    "Teeth_and_gum_health": [["Poor", "Fair", "Good", "Very good", "Excellent", "Unknown"]]
}

# --------------------------
# Encoding functions
# --------------------------

def encode_dataset(df):
    df_encoded = df.copy()

    # Normalize all Unknown-like responses into "Unknown"
    df_encoded = df_encoded.replace(
        {"Don't know": "Unknown", "Refused": "Unknown", "Not Applicable": "Unknown", "N/A": "Unknown", "Unknown/NA": "Unknown"}
    )

    # Binary encoding: map Yes/No/Unknown
    for col in binary_cols:
        if col == "Has_diabetes":
            df_encoded[col] = df_encoded[col].map({"No": 0, "Yes": 1, "Borderline": 2, "Unknown": 2})
        elif col == "Received_Hepatitis_A_Vaccine":
            df_encoded[col] = df_encoded[col].map({"No doses": 0, "Yes, at least 2 doses": 1, "Less than 2 doses": 1, "Unknown": 2, })
        else:
            df_encoded[col] = df_encoded[col].map({"No": 0, "Yes": 1, "Unknown": 2})

    # Ordinal encoding
    for col, categories in ordinal_cols.items():
        if col in df_encoded.columns:
            encoder = OrdinalEncoder(categories=categories, dtype=int)
            df_encoded[col] = encoder.fit_transform(df_encoded[[col]])

    # One-Hot encoding
    df_encoded = pd.get_dummies(df_encoded, columns=[col for col in ohe_cols if col in df_encoded.columns], drop_first=False, dtype=int)

    return df_encoded

# --------------------------
# Usage
# --------------------------

df_encoded = encode_dataset(df)

print("Original shape:", df.shape)
print("Encoded shape:", df_encoded.shape)
df_encoded.head()


Original shape: (10492, 46)
Encoded shape: (10492, 55)


Unnamed: 0,General_hearing_condition,Had_high_blood_pressure,WBC,Haemoglobin,Platelete,Age,Education_Level,Household_Size,Income_to_Poverty_Ratio,Has_diabetes,...,Race_Ethnicity_Other Hispanic,Race_Ethnicity_Other Race - Including Multi-Racial,Gender_Female,Gender_Male,Country_of_Birth_Born in 50 US states or Washington,Country_of_Birth_Unknown,Marital_Status_Married/Living with partner,Marital_Status_Never married,Marital_Status_Unknown,Marital_Status_Widowed/Divorced/Separated
0,5,1,4.7,15.7,259.0,43.0,4,4.0,5.0,0,...,0,1,0,1,0,1,1,0,0,0
1,2,1,6.3,15.2,221.0,66.0,4,2.0,5.0,0,...,0,0,0,1,1,0,1,0,0,0
2,2,0,5.7,13.8,235.0,44.0,2,7.0,1.41,1,...,1,0,1,0,0,1,1,0,0,0
3,4,0,6.8,13.9,264.5,43.0,1,2.0,0.63,0,...,0,0,0,1,0,1,0,1,0,0
4,4,0,6.5,14.0,241.0,65.0,2,2.0,5.0,0,...,0,0,1,0,1,0,1,0,0,0


In [12]:
for col in df_encoded.columns:
    print(f"Column: {col}")
    print(df_encoded[col].unique())
    print("-" * 50)

Column: General_hearing_condition
[5 2 4 3 1 0 6]
--------------------------------------------------
Column: Had_high_blood_pressure
[1 0 2]
--------------------------------------------------
Column: WBC
[ 4.7   6.3   5.7   6.8   6.5   5.5   6.    8.3   5.8   9.7   9.5   5.6
  5.4   6.6  10.    6.2   4.5   8.5   6.7   5.3  10.6   6.4   9.3   4.
  6.45  6.9   8.    8.8   8.1  10.4   8.2  14.3   7.8   5.9   7.    8.7
  7.7   5.   12.1   5.2   7.1   6.1   7.6  10.3  13.3  11.9  11.8   4.9
  7.4   4.8   7.5  13.9  13.4   3.9  10.1   4.6   7.2  13.7  11.3   4.2
  7.3  12.9   5.1  15.4   9.2  10.8  10.5   3.4  15.1   8.4   3.5  12.2
  9.9   7.9  11.7   4.4   9.1   9.6  10.2   9.8   4.1  11.4   8.6  13.
 12.8   9.    4.3  15.6   9.4   8.9   3.   13.2  11.2   2.7  11.1   3.8
 13.5  12.6  10.9  17.4  10.7  11.5  17.5  12.7  11.6  12.    3.7  14.5
  3.6   2.6   3.1  13.1   3.2   2.5   2.9   2.3  11.   12.4   2.2  14.1
 14.   14.2  12.5  12.3   3.3   2.8  15.9  13.8   2.4  14.7  16.   15.3
 16.7 

In [13]:
# verify if encoding is sucessful

for col in binary_cols:
    if col in df_encoded.columns:
        print(col, df_encoded[col].unique())


for col in ordinal_cols.keys():
    if col in df_encoded.columns:
        print(col, df_encoded[col].unique())


[col for col in df_encoded.columns if any(base in col for base in ohe_cols)]

print("Original shape:", df.shape)
print("Encoded shape:", df_encoded.shape)


df = df_encoded

Covered_by_health_insurance [1 0 2]
Has_Kidney_Failure [0 2 1]
Had_high_blood_pressure [1 0 2]
Takes_vitamin_supplements [0 1]
Tested_for_HIV_Virus [0 1 2]
Has_diabetes [0 1 2]
Had_Asthma [0 1 2]
Had_Arthritis [1 0 2]
Had_heart_attack [0 2 1]
Had_Thyroid [0 1 2]
Had_Liver_COndition [0 2 1]
Had_Cancer [0 1 2]
Has_Hepatitis [0 1 2]
Has_Disability [0 1 2]
Received_Hepatitis_A_Vaccine [2 1 0]
Age_Group [2 4 1 3 0 5]
Education_Level [4 2 1 3 5 0]
General_health_condition [4 2 3 1 0 5]
General_hearing_condition [5 2 4 3 1 0 6]
functional_difficulty_composite [2 3 1 0]
Teeth_and_gum_health [4 3 1 0 2 5]
Original shape: (10492, 46)
Encoded shape: (10492, 55)


In [14]:
df.columns

Index(['General_hearing_condition', 'Had_high_blood_pressure', 'WBC',
       'Haemoglobin', 'Platelete', 'Age', 'Education_Level', 'Household_Size',
       'Income_to_Poverty_Ratio', 'Has_diabetes', 'Takes_vitamin_supplements',
       'Has_Disability', 'HDL_mg', 'Has_Hepatitis',
       'Covered_by_health_insurance', 'Tested_for_HIV_Virus',
       'General_health_condition', 'Received_Hepatitis_A_Vaccine',
       'Family_poverty_level_index', 'Has_Kidney_Failure', 'Had_Asthma',
       'Had_Arthritis', 'Had_heart_attack', 'Had_Thyroid',
       'Had_Liver_COndition', 'Had_Cancer', 'Teeth_and_gum_health',
       'Number_of_Moderate_Physical_activities_per_week',
       'Number_of_Vigorous_Physical_activities_per_week',
       'Number_of_hours_of_sleep', 'Cholestrol_level', 'SystolicBP',
       'DiastolicBP', 'Pulse', 'BODY_MEASURE_COMPOSITE', 'blood_macros',
       'mean_steroid_ng_dl', 'balance_symptom_score', 'balance_impact_score',
       'fall_risk_score', 'functional_difficulty_compos

In [15]:

numerical_cols = ["Number_of_Vigorous_Physical_activities_per_week","Number_of_Moderate_Physical_activities_per_week",
                  "mean_steroid_ng_dl","blood_macros","HDL_mg","WBC","Platelete","SystolicBP","Household_Size","Cholestrol_level","Pulse","DiastolicBP","Family_poverty_level_index","Income_to_Poverty_Ratio",
                  "Number_of_hours_of_sleep","BODY_MEASURE_COMPOSITE","Age","Haemoglobin",'balance_symptom_score','balance_impact_score','fall_risk_score']

# Skewness
print("\n=== Skewness of Numerical Columns ===")
print(df[numerical_cols].skew().sort_values(ascending=False))



=== Skewness of Numerical Columns ===
Number_of_Vigorous_Physical_activities_per_week    36.440132
fall_risk_score                                    17.915625
Number_of_Moderate_Physical_activities_per_week    17.605238
mean_steroid_ng_dl                                  4.424904
blood_macros                                        2.974533
balance_impact_score                                2.736562
balance_symptom_score                               2.245220
HDL_mg                                              1.330054
WBC                                                 1.290663
SystolicBP                                          1.005113
Platelete                                           0.964591
Cholestrol_level                                    0.789117
Household_Size                                      0.726165
Pulse                                               0.599069
DiastolicBP                                         0.493675
Family_poverty_level_index                    

In [16]:
# Columns to log-transform
# other columns above 0.5 are binary columns
skewed_cols = ["Number_of_Vigorous_Physical_activities_per_week", "Number_of_Moderate_Physical_activities_per_week", "mean_steroid_ng_dl", 
                            "blood_macros", "HDL_mg", "WBC", "Platelete", "SystolicBP", "Household_Size", "Cholestrol_level", "Pulse",'balance_symptom_score','balance_impact_score','fall_risk_score']

# Apply log1p safely (handles zeros)
for col in skewed_cols:
    if col in df.columns:
        df[col] = np.log1p(df[col].clip(lower=0))

## Scaling

In [17]:
import matplotlib.pyplot as plt
import seaborn as sns

# Select numerical (continuous) columns
numerical_cols = ["Number_of_Vigorous_Physical_activities_per_week","Number_of_Moderate_Physical_activities_per_week",
                  "mean_steroid_ng_dl","blood_macros","HDL_mg","WBC","Platelete","SystolicBP","Household_Size","Cholestrol_level","Pulse","DiastolicBP","Family_poverty_level_index","Income_to_Poverty_Ratio",
                  "Number_of_hours_of_sleep","BODY_MEASURE_COMPOSITE","Age","Haemoglobin",'balance_symptom_score','balance_impact_score','fall_risk_score']

# Z-Score method (check how many values > |3|)
from scipy.stats import zscore
outlier_report = {}
for col in numerical_cols:
    z_scores = zscore(df[col].dropna())
    outliers = (abs(z_scores) > 3).sum()
    outlier_report[col] = outliers

print("Outliers detected (Z-score > 3):")
for col, count in outlier_report.items():
    print(f"{col}: {count}")



Outliers detected (Z-score > 3):
Number_of_Vigorous_Physical_activities_per_week: 30
Number_of_Moderate_Physical_activities_per_week: 18
mean_steroid_ng_dl: 150
blood_macros: 262
HDL_mg: 83
WBC: 111
Platelete: 106
SystolicBP: 80
Household_Size: 0
Cholestrol_level: 73
Pulse: 65
DiastolicBP: 86
Family_poverty_level_index: 0
Income_to_Poverty_Ratio: 0
Number_of_hours_of_sleep: 178
BODY_MEASURE_COMPOSITE: 86
Age: 0
Haemoglobin: 118
balance_symptom_score: 36
balance_impact_score: 145
fall_risk_score: 170


In [18]:
#StandardScaler assumes normal-like distribution. Outliers will still pull the mean/std strongly.

# RobustScaler uses median and IQR → much better when outliers exist.


from sklearn.preprocessing import StandardScaler, RobustScaler

# Define ALL columns to scale (numerical + ordinal + binary)

# 1. All numerical continuous columns
numerical_cols = [
    "Number_of_Vigorous_Physical_activities_per_week",
    "Number_of_Moderate_Physical_activities_per_week",
    "mean_steroid_ng_dl", "blood_macros", "HDL_mg", "WBC", 
    "Platelete", "SystolicBP", "Household_Size", "Cholestrol_level",
    "Pulse", "DiastolicBP", "Family_poverty_level_index", 
    "Income_to_Poverty_Ratio", "Number_of_hours_of_sleep",
    "BODY_MEASURE_COMPOSITE", "Age", "Haemoglobin",'balance_symptom_score','balance_impact_score','fall_risk_score'
]

# 2. Ordinal encoded columns (now integers in df_encoded)
ordinal_cols_list = [
    "Age_Group", 
    "Education_Level", 
    "General_health_condition",
    "General_hearing_condition", 
    "functional_difficulty_composite",
    "Teeth_and_gum_health"
]

# 3. Binary encoded columns (now 0/1/2 in df_encoded)
binary_cols_list = [
    "Covered_by_health_insurance",
    "Had_alcohol_in_the_past",
    "Has_Kidney_Failure",
    "Had_high_blood_pressure",
    "Takes_vitamin_supplements",
    "Tested_for_HIV_Virus",
    "Has_diabetes",
    "Had_Asthma",
    "Had_Arthritis",
    "Had_heart_attack",
    "Had_Thyroid",
    "Had_Liver_COndition",
    "Had_Cancer",
    "Has_Hepatitis",
    "Has_Disability",
    "Received_Hepatitis_A_Vaccine"
]

# Combine all columns to scale
cols_to_scale = numerical_cols + ordinal_cols_list + binary_cols_list

# Filter only columns that exist in df_encoded
cols_to_scale = [col for col in cols_to_scale if col in df_encoded.columns]

print(f"Total columns to scale: {len(cols_to_scale)}")
print(f"  - Numerical continuous: {len([c for c in numerical_cols if c in df_encoded.columns])}")
print(f"  - Ordinal encoded: {len([c for c in ordinal_cols_list if c in df_encoded.columns])}")
print(f"  - Binary encoded: {len([c for c in binary_cols_list if c in df_encoded.columns])}")

# Identify one-hot encoded columns (will NOT be scaled)
all_cols = set(df_encoded.columns)
cols_to_scale_set = set(cols_to_scale)
ohe_cols = list(all_cols - cols_to_scale_set)

print(f"  - One-hot encoded (NOT scaled): {len(ohe_cols)}")

# Apply RobustScaler
scaler = RobustScaler()
df_scaled = df_encoded.copy()
df_scaled[cols_to_scale] = scaler.fit_transform(df_encoded[cols_to_scale])

print(f"\n✓ RobustScaler applied successfully!")
print(f"✓ Scaled {len(cols_to_scale)} features")
print(f"✓ One-hot encoded columns remain as 0/1")

Total columns to scale: 42
  - Numerical continuous: 21
  - Ordinal encoded: 6
  - Binary encoded: 15
  - One-hot encoded (NOT scaled): 13

✓ RobustScaler applied successfully!
✓ Scaled 42 features
✓ One-hot encoded columns remain as 0/1


#### Apply Standard Scling as it works well for PCA and check what happens!

In [21]:
df_scaled.to_csv('Data/df_scaled.csv')

### Save all the models and results later for rela time inference

In [None]:
import pickle
import joblib
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, RobustScaler

def save_preprocessing_components(df_encoded, scaler):
    """
    Save preprocessing components using the already fitted scaler
    and creating encoders from the known mappings
    """
    preprocessing_pipeline = {}
    
    # Save the already fitted scaler
    preprocessing_pipeline['scaler'] = scaler
    preprocessing_pipeline['cols_to_scale'] = cols_to_scale
    
    # Save binary mappings
    preprocessing_pipeline['binary_mappings'] = {
        "Has_diabetes": {"No": 0, "Yes": 1, "Borderline": 2, "Unknown": 2},
        "Received_Hepatitis_A_Vaccine": {"No doses": 0, "Yes, at least 2 doses": 1, 
                                         "Less than 2 doses": 1, "Unknown": 2},
        "default": {"No": 0, "Yes": 1, "Unknown": 2}
    }
    
    # Create and save ordinal encoders from known categories
    ordinal_encoders = {}
    ordinal_mappings = {
        "Age_Group": ["0-18", "19-30", "31-45", "46-60", "61-75", "76+", "Unknown"],
        "Education_Level": ["Less than 9th grade",
                           "9-11th grade (Includes 12th grade with no diploma)",
                           "High school graduate/GED or equivalent",
                           "Some college or AA degree",
                           "College graduate or above",
                           "Unknown"],
        "General_health_condition": ["Poor", "Fair", "Good", "Very good", "Excellent", "Unknown"],
        "General_hearing_condition": ["Deaf", "A lot of trouble", "Moderate hearing trouble",
                                     "A little trouble", "Good", "Excellent", "Unknown"],
        "functional_difficulty_composite": ['Very Severe Difficulty', 'A lot of difficulty', 
                                           'No difficulty', 'Some difficulty', 'Unknown'],
        "Teeth_and_gum_health": ["Poor", "Fair", "Good", "Very good", "Excellent", "Unknown"]
    }
    
    for col, categories in ordinal_mappings.items():
        encoder = OrdinalEncoder(categories=[categories], dtype=int)
        # Create dummy data to fit the encoder
        dummy_data = [[cat] for cat in categories]
        encoder.fit(dummy_data)
        ordinal_encoders[col] = encoder
    
    preprocessing_pipeline['ordinal_encoders'] = ordinal_encoders
    
    # Save column lists
    preprocessing_pipeline['binary_cols'] = binary_cols
    preprocessing_pipeline['ohe_cols'] = ohe_cols
    preprocessing_pipeline['skewed_cols'] = skewed_cols
    preprocessing_pipeline['all_columns'] = df_encoded.columns.tolist()
    
    # Save one-hot encoded column names
    ohe_column_names = [col for col in df_encoded.columns 
                        if any(base in col for base in ohe_cols)]
    preprocessing_pipeline['ohe_column_names'] = ohe_column_names
    
    # Save to file
    with open('Data/preprocessing_pipeline.pkl', 'wb') as f:
        pickle.dump(preprocessing_pipeline, f)
    
    print("✓ Preprocessing pipeline saved!")
    print(f"  - Saved {len(ordinal_encoders)} ordinal encoders")
    print(f"  - Saved scaler type: {type(scaler).__name__}")
    print(f"  - Total columns in pipeline: {len(df_encoded.columns)}")
    
    return preprocessing_pipeline

# Save your current pipeline (use df_scaled which is your final encoded and scaled data)
pipeline = save_preprocessing_components(df_scaled, scaler)

✓ Preprocessing pipeline saved!
  - Saved 6 ordinal encoders
  - Saved scaler type: RobustScaler
  - Total columns in pipeline: 55


In [33]:
import numpy as np
import pandas as pd

def preprocess_new_record(new_record, pipeline_path='Data/preprocessing_pipeline.pkl'):
    """
    Apply the same preprocessing to new records
    
    Parameters:
    -----------
    new_record : dict or pd.DataFrame
        New record(s) to preprocess
    pipeline_path : str
        Path to saved preprocessing pipeline
        
    Returns:
    --------
    pd.DataFrame : Preprocessed record matching training data format
    """
    
    # Load pipeline
    with open(pipeline_path, 'rb') as f:
        pipeline = pickle.load(f)
    
    # Convert to DataFrame if needed
    if isinstance(new_record, dict):
        df_new = pd.DataFrame([new_record])
    else:
        df_new = new_record.copy()
    
    # Step 1: Normalize Unknown-like responses
    df_new = df_new.replace({
        "Don't know": "Unknown", "Refused": "Unknown", 
        "Not Applicable": "Unknown", "N/A": "Unknown", 
        "Unknown/NA": "Unknown"
    })
    
    # Step 2: Binary encoding
    for col in pipeline['binary_cols']:
        if col not in df_new.columns:
            continue
        
        if col == "Has_diabetes":
            mapping = pipeline['binary_mappings']["Has_diabetes"]
        elif col == "Received_Hepatitis_A_Vaccine":
            mapping = pipeline['binary_mappings']["Received_Hepatitis_A_Vaccine"]
        else:
            mapping = pipeline['binary_mappings']["default"]
        
        df_new[col] = df_new[col].map(mapping)
    
    # Step 3: Ordinal encoding
    for col, encoder in pipeline['ordinal_encoders'].items():
        if col in df_new.columns:
            df_new[col] = encoder.transform(df_new[[col]])
    
    # Step 4: One-hot encoding
    for base_col in pipeline['ohe_cols']:
        if base_col in df_new.columns:
            # Get dummies for this column
            dummies = pd.get_dummies(df_new[base_col], prefix=base_col, dtype=int)
            
            # Add any missing columns from training
            for train_col in pipeline['ohe_column_names']:
                if base_col in train_col and train_col not in dummies.columns:
                    dummies[train_col] = 0
            
            # Remove extra columns not in training
            cols_to_keep = [col for col in dummies.columns 
                           if col in pipeline['ohe_column_names']]
            dummies = dummies[cols_to_keep]
            
            # Add to dataframe
            df_new = pd.concat([df_new.drop(columns=[base_col]), dummies], axis=1)
    
    # Step 5: Apply log transformation to skewed columns
    for col in pipeline['skewed_cols']:
        if col in df_new.columns:
            df_new[col] = np.log1p(df_new[col].clip(lower=0))
    
    # Step 6: Ensure all columns from training exist
    for col in pipeline['all_columns']:
        if col not in df_new.columns:
            df_new[col] = 0  # Add missing columns with default value
    
    # Step 7: Reorder columns to match training data
    df_new = df_new[pipeline['all_columns']]
    
    # Step 8: Apply scaling
    df_new[pipeline['cols_to_scale']] = pipeline['scaler'].transform(
        df_new[pipeline['cols_to_scale']]
    )
    
    return df_new

In [None]:
# Example new patient record
new_patient = {
    'General_hearing_condition': 'Good',
    'Had_high_blood_pressure': 'Yes',
    'WBC': 6.5,
    'Haemoglobin': 14.2,
    'Platelete': 250,
    'Gender': 'Female',
    'Age': 45,
    'Race_Ethnicity': 'Non-Hispanic White',
    'Country_of_Birth': 'Born in 50 US states or Washington',
    'Education_Level': 'College graduate or above',
    'Marital_Status': 'Married/Living with partner',
    'Household_Size': 4,
    'Income_to_Poverty_Ratio': 3.5,
    'Has_diabetes': 'No',
    'Takes_vitamin_supplements': 'Yes',
    'Has_Disability': 'No',
    'HDL_mg': 55,
    'Has_Hepatitis': 'No',
    'Covered_by_health_insurance': 'Yes',
    'Tested_for_HIV_Virus': 'No',
    'General_health_condition': 'Very good',
    'Received_Hepatitis_A_Vaccine': 'No doses',
    'Family_poverty_level_index': 3.5,
    'Has_Kidney_Failure': 'No',
    'Had_Asthma': 'No',
    'Had_Arthritis': 'No',
    'Had_heart_attack': 'No',
    'Had_Thyroid': 'No',
    'Had_Liver_COndition': 'No',
    'Had_Cancer': 'No',
    'Teeth_and_gum_health': 'Good',
    'Number_of_Moderate_Physical_activities_per_week': 3,
    'Number_of_Vigorous_Physical_activities_per_week': 2,
    'Number_of_hours_of_sleep': 7,
    'Cholestrol_level': 180,
    'SystolicBP': 120,
    'DiastolicBP': 80,
    'Pulse': 72,
    'BODY_MEASURE_COMPOSITE': 0.5,
    'blood_macros': 0.2,
    'mean_steroid_ng_dl': 0.3,
    'balance_symptom_score': 0,
    'balance_impact_score': 0,
    'fall_risk_score': 0,
    'functional_difficulty_composite': 'No difficulty',
    'Age_Group': '31-45'
}

# Preprocess the new record
processed_patient = preprocess_new_record(new_patient)

print(f"Shape of processed record: {processed_patient.shape}")
print(f"Matches training data shape: {processed_patient.shape[1] == df_scaled.shape[1]}")

Shape of processed record: (1, 55)
Matches training data shape: True

First 5 processed values:
General_hearing_condition    0.000000
Had_high_blood_pressure      1.000000
WBC                         -0.126195
Haemoglobin                  0.214286
Platelete                   -0.072534
Name: 0, dtype: float64




In [35]:
processed_patient

Unnamed: 0,General_hearing_condition,Had_high_blood_pressure,WBC,Haemoglobin,Platelete,Age,Education_Level,Household_Size,Income_to_Poverty_Ratio,Has_diabetes,...,Race_Ethnicity_Other Hispanic,Race_Ethnicity_Other Race - Including Multi-Racial,Gender_Female,Gender_Male,Country_of_Birth_Born in 50 US states or Washington,Country_of_Birth_Unknown,Marital_Status_Married/Living with partner,Marital_Status_Never married,Marital_Status_Unknown,Marital_Status_Widowed/Divorced/Separated
0,0.0,1.0,-0.126195,0.214286,-0.072534,-0.075,0.5,0.436829,0.340996,0.0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Save the preprocessed patient data to a file (after preprocessing in the first notebook)
# Add this at the end of your preprocessing notebook:
processed_patient.to_csv('Data/processed_new_patient.csv', index=False)