In [1]:
# ============================================================
# CA6000 (Kaggle PS S5E12) — Data Cleaning & Preprocessing
# Output: X_train_proc, X_val_proc, y_train, y_val, X_test_proc
# ============================================================

import os
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
import joblib

# Seed passed as random_state to the train/validation split for reproducibility.
SEED = 42
# Name of the label column; it must be present in train.csv.
TARGET_COL = "diagnosed_diabetes"
# Name of the row-identifier column; it must be present in both train.csv and test.csv.
ID_COL = "id"

In [2]:

# ----------------------------
# 1) Robust path resolver (Kaggle / Colab / local / /mnt/data)
# ----------------------------
from pathlib import Path

def resolve_dataset_paths(prefer_dir="/content"):
    """Locate train.csv, test.csv and (optionally) sample_submission.csv.

    Search roots are tried in order: ``prefer_dir``, ``/mnt/data``, the
    current directory, and ``/kaggle/input`` when that directory exists.
    The first root that contains both train.csv and test.csv wins.

    Inside a root, a file sitting directly in the root is preferred;
    otherwise the tree is searched recursively. Recursive hits are sorted so
    the choice is deterministic, and a train.csv that shares its directory
    with a test.csv is preferred so the two files are not picked from
    unrelated dataset folders. If no such pair exists, the first hit of each
    file is used.

    Parameters
    ----------
    prefer_dir : str or Path, default="/content"
        Directory searched before the built-in fallbacks.

    Returns
    -------
    tuple of (str, str, str or None)
        Paths to train.csv, test.csv and sample_submission.csv; the last is
        ``None`` when no sample submission exists under the chosen root.

    Raises
    ------
    FileNotFoundError
        If no search root contains both train.csv and test.csv.
    """
    candidates = [Path(prefer_dir), Path("/mnt/data"), Path(".")]

    kaggle_input = Path("/kaggle/input")
    if kaggle_input.exists():
        candidates.append(kaggle_input)

    def find_all(root: Path, filename: str):
        # A file directly in the root takes priority over anything nested.
        direct = root / filename
        if direct.is_file():
            return [direct]
        # Sort so the selection does not depend on filesystem iteration order.
        return sorted(p for p in root.rglob(filename) if p.is_file())

    train_path = test_path = sub_path = None
    for root in candidates:
        if not root.is_dir():
            continue

        train_hits = find_all(root, "train.csv")
        test_hits = find_all(root, "test.csv")
        if not train_hits or not test_hits:
            continue

        # Prefer a train/test pair living in the same directory.
        test_by_dir = {p.parent: p for p in test_hits}
        paired_train = next((p for p in train_hits if p.parent in test_by_dir), None)
        if paired_train is not None:
            train_path, test_path = paired_train, test_by_dir[paired_train.parent]
        else:
            train_path, test_path = train_hits[0], test_hits[0]

        # Same preference for the sample submission: next to train.csv if possible.
        sub_hits = find_all(root, "sample_submission.csv")
        sub_path = next(
            (p for p in sub_hits if p.parent == train_path.parent),
            sub_hits[0] if sub_hits else None,
        )
        break

    if train_path is None or test_path is None:
        searched = ", ".join(str(c) for c in candidates)
        raise FileNotFoundError(
            f"Cannot find train.csv/test.csv under preferred dirs: {searched}"
        )

    return str(train_path), str(test_path), (str(sub_path) if sub_path else None)

# Resolve the CSV locations once (searching /content first) and echo them so
# the run log records which files were actually used. SUB_PATH may be None.
TRAIN_PATH, TEST_PATH, SUB_PATH = resolve_dataset_paths("/content")
print(TRAIN_PATH, TEST_PATH, SUB_PATH)

/content/train.csv /content/test.csv /content/sample_submission.csv


In [13]:
# ----------------------------
# 2) Load data
# ----------------------------
# Read both CSVs with the same reader, train first and then test.
train_df, test_df = map(pd.read_csv, (TRAIN_PATH, TEST_PATH))

# Report the frame sizes, then preview the first three training rows.
print("\nShapes:")
print("train:", train_df.shape)
print("test :", test_df.shape)

print("\nTrain head:")
display(train_df.head(3))


Shapes:
train: (700000, 26)
test : (300000, 25)

Train head:


Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
0,0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,...,Female,Hispanic,Highschool,Lower-Middle,Current,Employed,0,0,0,1.0
1,1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,...,Female,White,Highschool,Upper-Middle,Never,Employed,0,0,0,1.0
2,2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,...,Male,Hispanic,Highschool,Lower-Middle,Never,Retired,0,0,0,0.0



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [14]:
# ----------------------------
# 3) Data audit & sanity checks (good for your report)
# ----------------------------
def basic_audit(train_df: pd.DataFrame, test_df: pd.DataFrame):
    """Validate the train/test schema and print a short data-quality summary.

    Assertions (each raises ``AssertionError`` on failure):

    * the target column exists in ``train_df`` and the id column exists in
      both frames;
    * train columns minus the target equal the test columns (as sets);
    * ids are unique within each frame;
    * after the summary is printed, the target has no missing values and
      only takes the values 0 and 1.

    Printed summary: duplicate-row counts (ignoring the id column), the ten
    highest per-column missing rates for each frame, and the target's unique
    values and value counts.

    Parameters
    ----------
    train_df : pd.DataFrame
        Training frame, including the target and id columns.
    test_df : pd.DataFrame
        Test frame, including the id column.

    Returns
    -------
    None
    """
    # Required columns
    assert TARGET_COL in train_df.columns, f"Missing target '{TARGET_COL}' in train.csv"
    assert ID_COL in train_df.columns and ID_COL in test_df.columns, "Missing 'id' in train/test"

    # Column alignment (except target)
    train_features = [c for c in train_df.columns if c != TARGET_COL]
    assert set(train_features) == set(test_df.columns), "Train features != Test columns (schema mismatch)"

    # ID uniqueness
    assert train_df[ID_COL].is_unique, "Train id is not unique"
    assert test_df[ID_COL].is_unique, "Test id is not unique"

    # Duplicates: compare rows WITHOUT the id column. Ids were just asserted
    # unique, so a comparison that includes them could never find a duplicate.
    dup_train = train_df.drop(columns=[ID_COL]).duplicated().sum()
    dup_test = test_df.drop(columns=[ID_COL]).duplicated().sum()

    # Missing summary
    miss_train = (train_df.isnull().mean().sort_values(ascending=False))
    miss_test = (test_df.isnull().mean().sort_values(ascending=False))

    # Target check
    y = train_df[TARGET_COL]
    unique_y = sorted(y.dropna().unique().tolist())
    n_missing_y = int(y.isna().sum())

    print("\n[Audit] duplicates (excluding id):", {"train": int(dup_train), "test": int(dup_test)})
    print("[Audit] top missing rate (train):")
    print(miss_train.head(10))
    print("[Audit] top missing rate (test):")
    print(miss_test.head(10))
    print("[Audit] target unique values:", unique_y)
    print("[Audit] target distribution:\n", y.value_counts(dropna=False))

    # Enforce a clean binary target. Done after printing so the summary is
    # still visible when the check fails. Without this, a later integer cast
    # would silently truncate values such as 0.5.
    assert n_missing_y == 0, f"Target '{TARGET_COL}' has {n_missing_y} missing values"
    assert set(unique_y) <= {0, 1}, f"Target '{TARGET_COL}' is not binary: {unique_y}"

# Run the schema / quality audit before any further processing.
basic_audit(train_df, test_df)

# Convert target to int (0/1)
# The head() preview shows the target stored as floats (1.0 / 0.0).
# NOTE: this overwrites the column on train_df in place.
train_df[TARGET_COL] = train_df[TARGET_COL].astype(int)


[Audit] duplicates: {'train': 0, 'test': 0}
[Audit] top missing rate (train):
id                                    0.0
age                                   0.0
alcohol_consumption_per_week          0.0
physical_activity_minutes_per_week    0.0
diet_score                            0.0
sleep_hours_per_day                   0.0
screen_time_hours_per_day             0.0
bmi                                   0.0
waist_to_hip_ratio                    0.0
systolic_bp                           0.0
dtype: float64
[Audit] top missing rate (test):
id                                    0.0
age                                   0.0
alcohol_consumption_per_week          0.0
physical_activity_minutes_per_week    0.0
diet_score                            0.0
sleep_hours_per_day                   0.0
screen_time_hours_per_day             0.0
bmi                                   0.0
waist_to_hip_ratio                    0.0
systolic_bp                           0.0
dtype: float64
[Audit] target uni

In [15]:
# ----------------------------
# 4) Define column groups
# ----------------------------
# Categorical columns (object/string)
cat_cols = list(train_df.select_dtypes(include=["object"]).columns)

# Known 0/1 flag columns; keep only the ones actually present in the frame.
bin_cols = [
    flag
    for flag in ("family_history_diabetes", "hypertension_history", "cardiovascular_history")
    if flag in train_df.columns
]

# Everything numeric that is not the id, the target, or one of the flags.
num_cols = [
    col
    for col in train_df.select_dtypes(include=[np.number]).columns
    if col not in {ID_COL, TARGET_COL, *bin_cols}
]

print("\nColumn groups:")
print("num_cols:", num_cols)
print("bin_cols:", bin_cols)
print("cat_cols:", cat_cols)


Column groups:
num_cols: ['age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week', 'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi', 'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate', 'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides']
bin_cols: ['family_history_diabetes', 'hypertension_history', 'cardiovascular_history']
cat_cols: ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']


In [6]:
# Optional: verify binary columns truly contain only 0/1
for flag_col in bin_cols:
    # Missing values are ignored here; only non-null values must be 0 or 1.
    observed = set(train_df[flag_col].dropna().unique())
    unexpected = observed.difference({0, 1})
    if unexpected:
        raise ValueError(f"Binary col '{flag_col}' has unexpected values: {unexpected}")

In [16]:
# ----------------------------
# 5) (Optional but nice) Range check for numeric columns
# ----------------------------
def numeric_range_report(df: pd.DataFrame, columns):
    """Summarise the value range of the given columns, one row per column.

    The result has the columns min, 1%, 50%, 99%, max, mean and std (in that
    order) and is sorted by ``max`` in descending order.
    """
    wanted = ["min", "1%", "50%", "99%", "max", "mean", "std"]
    summary = df[columns].describe(percentiles=[0.01, 0.5, 0.99]).transpose()
    # Drop the statistics we do not report (e.g. count) and fix the column order.
    compact = summary.loc[:, wanted]
    return compact.sort_values(by="max", ascending=False)

# Summarise only the continuous features (num_cols excludes id, target and the 0/1 flags).
range_report = numeric_range_report(train_df, num_cols)
print("\nNumeric range report (top 8 by max):")
# The report is sorted by "max" descending, so head(8) shows the eight columns
# with the largest maximum value.
display(range_report.head(8))



Numeric range report (top 8 by max):


Unnamed: 0,min,1%,50%,99%,max,mean,std
physical_activity_minutes_per_week,1.0,16.0,71.0,304.0,747.0,80.230803,51.195071
triglycerides,31.0,67.0,123.0,187.0,290.0,123.08185,24.739397
cholesterol_total,117.0,150.0,187.0,225.0,289.0,186.818801,16.730832
ldl_cholesterol,51.0,61.0,103.0,148.0,205.0,102.905854,19.022416
systolic_bp,91.0,93.0,116.0,141.0,163.0,116.294193,11.01039
diastolic_bp,51.0,60.0,75.0,91.0,104.0,75.440924,6.825775
heart_rate,42.0,54.0,70.0,86.0,101.0,70.167749,6.938722
hdl_cholesterol,21.0,35.0,54.0,73.0,90.0,53.823214,8.266545


In [8]:
# ----------------------------
# 6) Split data BEFORE fitting preprocessors (avoid leakage)
# ----------------------------
# Keep the identifiers aside; they are not used as model features.
train_ids = train_df[ID_COL].values
test_ids = test_df[ID_COL].values

# Feature frames: everything except the id (and, for train, the target).
X = train_df.drop(columns=[TARGET_COL, ID_COL])
X_test = test_df.drop(columns=[ID_COL])
y = train_df[TARGET_COL].values.astype(np.int32)

# 80/20 split, stratified on the target so both parts keep the class balance.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

print("\nSplit shapes:")
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_val  :", X_val.shape,   "y_val  :", y_val.shape)
print("X_test :", X_test.shape)



Split shapes:
X_train: (560000, 24) y_train: (560000,)
X_val  : (140000, 24) y_val  : (140000,)
X_test : (300000, 24)


In [9]:
# ----------------------------
# 7) Custom transformer: quantile clipping for numeric outliers
#    (fit on training only)
# ----------------------------
class QuantileClipper(BaseEstimator, TransformerMixin):
    """Clip every feature column to quantile bounds learned during ``fit``.

    Parameters
    ----------
    lower_q : float, default=0.005
        Quantile (between 0 and 1) used as the lower clipping bound.
    upper_q : float, default=0.995
        Quantile (between 0 and 1) used as the upper clipping bound. Must
        not be smaller than ``lower_q``.

    Attributes
    ----------
    lower_, upper_ : ndarray or float
        Per-column bounds computed with ``np.nanquantile`` (NaNs are ignored
        when estimating the bounds).
    n_features_in_ : int
        Number of columns seen in ``fit``; set only for 2-D input.
    """

    def __init__(self, lower_q=0.005, upper_q=0.995):
        self.lower_q = lower_q
        self.upper_q = upper_q

    def fit(self, X, y=None):
        """Learn the per-column clipping bounds from ``X``; ``y`` is ignored.

        Raises
        ------
        ValueError
            If the quantiles are outside [0, 1] or ``lower_q > upper_q``.
        """
        # Validate here rather than in __init__, so that constructing and
        # cloning the estimator never fails. Inverted bounds would otherwise
        # make np.clip return meaningless values without any error.
        if not 0.0 <= self.lower_q <= self.upper_q <= 1.0:
            raise ValueError(
                "Expected 0 <= lower_q <= upper_q <= 1, got "
                f"lower_q={self.lower_q!r}, upper_q={self.upper_q!r}"
            )

        X = np.asarray(X, dtype=float)
        self.lower_ = np.nanquantile(X, self.lower_q, axis=0)
        self.upper_ = np.nanquantile(X, self.upper_q, axis=0)
        if X.ndim == 2:
            self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X):
        """Return ``X`` as a float array with each column clipped to the fitted bounds."""
        X = np.asarray(X, dtype=float)
        return np.clip(X, self.lower_, self.upper_)

    def get_feature_names_out(self, input_features=None):
        """Return output feature names; clipping keeps columns one-to-one.

        Without this method a Pipeline containing the clipper cannot answer
        ``get_feature_names_out()``. When ``input_features`` is None, generic
        names ``x0 .. x{n-1}`` are generated from ``n_features_in_``.
        """
        if input_features is None:
            input_features = [f"x{i}" for i in range(self.n_features_in_)]
        return np.asarray(input_features, dtype=object)

In [10]:
# ----------------------------
# 8) Build preprocessing pipeline
#    - numeric: median impute -> clip -> standardize
#    - binary : most_frequent impute (keep 0/1)
#    - cate   : most_frequent impute -> one-hot
# ----------------------------
# Numeric features: median imputation, quantile clipping, then standardization.
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("clipper", QuantileClipper(lower_q=0.005, upper_q=0.995)),
    ("scaler", StandardScaler()),
])

# Binary flags: only imputed with the most frequent value; no scaling or encoding.
binary_pipe = Pipeline([("imputer", SimpleImputer(strategy="most_frequent"))])

# Categorical features: most-frequent imputation, then dense one-hot encoding.
# Categories not seen during fit are ignored at transform time.
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Route each column group through its own pipeline; unlisted columns are dropped.
preprocess = ColumnTransformer(
    [
        ("num", numeric_pipe, num_cols),
        ("bin", binary_pipe, bin_cols),
        ("cat", categorical_pipe, cat_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

In [11]:
# ----------------------------
# 9) Fit on train, transform val/test
# ----------------------------
# Fit the preprocessor on the training split only, then reuse the fitted
# transformer for validation and test. Results are cast to float32 for neural nets.
X_train_proc = preprocess.fit_transform(X_train).astype(np.float32)
X_val_proc = preprocess.transform(X_val).astype(np.float32)
X_test_proc = preprocess.transform(X_test).astype(np.float32)

print("\nProcessed shapes:")
print("X_train_proc:", X_train_proc.shape)
print("X_val_proc  :", X_val_proc.shape)
print("X_test_proc :", X_test_proc.shape)

# Safety checks: imputation should have removed every NaN.
assert not np.any(np.isnan(X_train_proc)), "NaNs remain in X_train_proc"
assert not np.any(np.isnan(X_val_proc)), "NaNs remain in X_val_proc"
assert not np.any(np.isnan(X_test_proc)), "NaNs remain in X_test_proc"



Processed shapes:
X_train_proc: (560000, 42)
X_val_proc  : (140000, 42)
X_test_proc : (300000, 42)


In [12]:
# ----------------------------
# 10) Save artifacts for reproducibility
# ----------------------------
# Bundle the fitted preprocessor with the column bookkeeping it was built from.
artifact = dict(
    id_col=ID_COL,
    target_col=TARGET_COL,
    num_cols=num_cols,
    bin_cols=bin_cols,
    cat_cols=cat_cols,
    preprocess=preprocess,
)

# NOTE(review): the pipeline contains QuantileClipper, a class defined in this
# notebook; presumably it must be defined/importable again before
# joblib.load() can restore the artifact. Confirm before reusing elsewhere.
joblib.dump(artifact, "preprocess_artifact.joblib")
print("\nSaved preprocess artifact -> preprocess_artifact.joblib")

# Optional: save processed arrays (may be large, enable if you want)
# np.save("X_train_proc.npy", X_train_proc)
# np.save("X_val_proc.npy", X_val_proc)
# np.save("X_test_proc.npy", X_test_proc)
# np.save("y_train.npy", y_train)
# np.save("y_val.npy", y_val)

print("\n✅ Ready for model training stage:")
print("Use X_train_proc, y_train, X_val_proc, y_val, X_test_proc")


Saved preprocess artifact -> preprocess_artifact.joblib

✅ Ready for model training stage:
Use X_train_proc, y_train, X_val_proc, y_val, X_test_proc
