## 1️⃣ Setup & Imports

In [10]:

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)


## 2️⃣ Utility Functions

In [11]:

def ensure_dir(p: str):
    os.makedirs(p, exist_ok=True)

def is_binary_like(s: pd.Series) -> bool:
    vals = s.dropna().unique()
    if len(vals) == 2:
        return True
    lowered = pd.Series(vals).astype(str).str.lower().unique()
    return set(lowered).issubset({"yes","no","true","false","positive","negative","pos","neg","y","n","1","0"})

def guess_target(df: pd.DataFrame):
    common = [
        "Outcome","outcome","target","Target","label","Label","class","Class",
        "diabetes","Diabetes","has_diabetes","diabetic","Diabetic"
    ]
    for c in common:
        if c in df.columns:
            return c
    return None


## 3️⃣ Load the Dataset

In [12]:

# Path to your CSV file
raw_data_path = "../data/raw/diabetes_dataset_E.csv"

df = pd.read_csv(raw_data_path)
print(f"Dataset loaded with {df.shape[0]} rows and {df.shape[1]} columns.")

# Guess target column if not specified
target_col = guess_target(df)
print(f"Target column detected: {target_col}")


Dataset loaded with 100000 rows and 28 columns.
Target column detected: diabetes


## 4️⃣ Exploratory Data Analysis (EDA)

In [13]:

def run_eda(df: pd.DataFrame, reports_dir: str, target_col: str = None):
    ensure_dir(reports_dir)

    # Save dtypes
    dtypes = df.dtypes.astype(str).rename("dtype").reset_index().rename(columns={"index":"column"})
    dtypes.to_csv(os.path.join(reports_dir, "01_dtypes.csv"), index=False)

    # Save missing values report
    miss = df.isna().sum().rename("missing_count").reset_index().rename(columns={"index":"column"})
    miss["missing_pct"] = (miss["missing_count"] / len(df) * 100).round(2)
    miss.to_csv(os.path.join(reports_dir, "02_missing_values.csv"), index=False)

    # Save numeric summary
    num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
    if num_cols:
        desc = df[num_cols].describe().T.reset_index().rename(columns={"index":"column"})
        desc.to_csv(os.path.join(reports_dir, "03_numeric_describe.csv"), index=False)

    # Target distribution
    if target_col and target_col in df.columns:
        tgt = df[target_col].value_counts(dropna=False).rename_axis(target_col).reset_index(name="count")
        tgt["percent"] = (tgt["count"] / len(df) * 100).round(2)
        tgt.to_csv(os.path.join(reports_dir, "04_target_distribution.csv"), index=False)

    # Histograms for numeric columns
    num_cols = num_cols[:6]
    for c in num_cols:
        plt.figure()
        df[c].hist(bins=30)
        plt.title(f"Histogram - {c}")
        plt.savefig(os.path.join(reports_dir, f"hist_{c}.png"))
        plt.close()

reports_dir = "../data/reports"
run_eda(df, reports_dir=reports_dir, target_col=target_col)
print(f"EDA reports saved to {reports_dir}")


EDA reports saved to ../data/reports


## 5️⃣ Data Preprocessing

In [14]:

def preprocess(df: pd.DataFrame, outdir: str, target_col: str = None):
    ensure_dir(outdir)

    df_clean = df.copy()

    # Remove duplicates
    df_clean = df_clean.drop_duplicates().reset_index(drop=True)

    # Identify columns
    num_cols = [c for c in df_clean.columns if pd.api.types.is_numeric_dtype(df_clean[c]) and c != target_col]
    obj_cols = [c for c in df_clean.columns if df_clean[c].dtype == "object" and c != target_col]

    # Fill missing values
    for c in num_cols:
        df_clean[c] = df_clean[c].fillna(df_clean[c].median())
    for c in obj_cols:
        df_clean[c] = df_clean[c].fillna(df_clean[c].mode()[0] if not df_clean[c].mode().empty else "Unknown")

    # Save human-readable version
    readable_path = os.path.join(outdir, "diabetes_preprocessed_readable.csv")
    df_clean.to_csv(readable_path, index=False)

    # Create numeric, ML-ready version
    df_encoded = pd.get_dummies(df_clean, columns=obj_cols, drop_first=False)
    if num_cols:
        scaler = StandardScaler()
        df_encoded[num_cols] = scaler.fit_transform(df_encoded[num_cols])

    numeric_path = os.path.join(outdir, "diabetes_preprocessed_numeric.csv")
    df_encoded.to_csv(numeric_path, index=False)

    return readable_path, numeric_path

outdir = "../data/processed"
readable_path, numeric_path = preprocess(df, outdir=outdir, target_col=target_col)

print(f"Human-readable CSV saved at: {readable_path}")
print(f"Numeric ML-ready CSV saved at: {numeric_path}")


Human-readable CSV saved at: ../data/processed\diabetes_preprocessed_readable.csv
Numeric ML-ready CSV saved at: ../data/processed\diabetes_preprocessed_numeric.csv


## 6️⃣ Verify Outputs

In [15]:

# Load preprocessed files for preview
df_readable = pd.read_csv(readable_path)
df_numeric = pd.read_csv(numeric_path)

print("Readable Data Sample:")
display(df_readable.head())

print("Numeric Data Sample:")
display(df_numeric.head())


Readable Data Sample:


Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,bmi_category,age_group,physical_activity,diet_pattern,sleep_hours,alcohol_intake,family_history,medication_use,gestational_history,urban_rural,region_income,environmental_risk,diabetes
0,2020,Female,32.0,Alabama,0,0,0,0,1,0,0,never,27.32,5.0,100,Overweight,Adult,low,balanced,4,none,0,0,0.0,urban,low,7,0
1,2015,Female,29.0,Alabama,0,1,0,0,0,0,0,never,19.95,5.0,90,Normal,Adult,high,balanced,9,none,0,1,1.0,urban,medium,9,0
2,2015,Male,18.0,Alabama,0,0,0,0,1,0,0,never,23.76,4.8,160,Normal,Adult,moderate,balanced,5,occasional,1,0,0.0,urban,medium,10,0
3,2015,Male,41.0,Alabama,0,0,1,0,0,0,0,never,27.32,4.0,159,Overweight,Middle-aged,moderate,balanced,6,regular,1,0,0.0,rural,low,2,0
4,2016,Female,52.0,Alabama,1,0,0,0,0,0,0,never,23.75,6.5,90,Normal,Middle-aged,low,balanced,5,none,0,0,0.0,rural,medium,6,0


Numeric Data Sample:


Unnamed: 0,year,age,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,bmi,hbA1c_level,blood_glucose_level,sleep_hours,family_history,medication_use,gestational_history,environmental_risk,diabetes,gender_Female,gender_Male,gender_Other,location_Alabama,location_Alaska,location_Arizona,location_Arkansas,location_California,location_Colorado,location_Connecticut,location_Delaware,location_District of Columbia,location_Florida,location_Georgia,location_Guam,location_Hawaii,location_Idaho,location_Illinois,location_Indiana,location_Iowa,location_Kansas,location_Kentucky,location_Louisiana,location_Maine,location_Maryland,location_Massachusetts,location_Michigan,location_Minnesota,location_Mississippi,location_Missouri,location_Montana,location_Nebraska,location_Nevada,location_New Hampshire,location_New Jersey,location_New Mexico,location_New York,location_North Carolina,location_North Dakota,location_Ohio,location_Oklahoma,location_Oregon,location_Pennsylvania,location_Puerto Rico,location_Rhode Island,location_South Carolina,location_South Dakota,location_Tennessee,location_Texas,location_United States,location_Utah,location_Vermont,location_Virgin Islands,location_Virginia,location_Washington,location_West Virginia,location_Wisconsin,location_Wyoming,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current,bmi_category_Normal,bmi_category_Obese,bmi_category_Overweight,bmi_category_Underweight,age_group_Adult,age_group_Child,age_group_Middle-aged,age_group_Senior,physical_activity_high,physical_activity_low,physical_activity_moderate,diet_pattern_balanced,diet_pattern_high-carb,diet_pattern_low-carb,alcohol_intake_none,alcohol_intake_occasional,alcohol_intake_regular,urban_rural_rural,urban_rural_urban,region_income_high,region_income_low,region_income_medium
0,1.218511,-0.439045,-0.503482,-0.500234,-0.498062,-0.498249,2.000125,-0.284439,-0.202578,-0.000116,-0.49269,-0.934905,-1.459842,-0.654856,-0.420825,-0.248498,0.519513,0,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,True,False,False,False,False,True,False,True,False,False,True,False,False,False,True,False,True,False
1,-2.49832,-0.572279,-0.503482,1.999063,-0.498062,-0.498249,-0.499969,-0.284439,-0.202578,-1.110599,-0.49269,-1.180558,1.467007,-0.654856,2.376284,4.02417,1.215245,0,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,True,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,True
2,-2.49832,-1.060805,-0.503482,-0.500234,-0.498062,-0.498249,2.000125,-0.284439,-0.202578,-0.536523,-0.67949,0.539009,-0.874472,1.527053,-0.420825,-0.248498,1.563111,0,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,True,False,False,False,False,False,True,True,False,False,False,True,False,False,True,False,False,True
3,-2.49832,-0.039342,-0.503482,-0.500234,2.007783,-0.498249,-0.499969,-0.284439,-0.202578,-0.000116,-1.426688,0.514444,-0.289102,1.527053,-0.420825,-0.248498,-1.219816,0,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,True,True,False,False,False,False,True,True,False,False,True,False
4,-1.754954,0.449184,1.986168,-0.500234,-0.498062,-0.498249,-0.499969,-0.284439,-0.202578,-0.538029,0.908306,-1.180558,-0.874472,-0.654856,-0.420825,-0.248498,0.171647,0,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,True,False,True,False,False,True,False,False,True,False,False,False,True
