In [2]:
from pathlib import Path
from labpulse.cleaning import prepare_base_df, filter_numeric_rows
from labpulse.qc_rules import apply_unit_rules

# ============================================================
# 02 — Cleaning & Standardization (LabPulse)
#
# Purpose:
# - Build a clean dataset for analysis:
#   * valid numeric measurements (value_num not NaN)
#   * valid measurement units (unit_ok == True)
# - Save cleaned dataset to data/processed/
#
# Output:
# - data/processed/samples_cleaned.csv
# ============================================================

# Input and output locations, relative to the directory the notebook runs from.
# Built from individual components; equal to the single-string spellings.
RAW_PATH = Path("..", "data", "raw", "samples.csv")
OUT_PATH = Path("..", "data", "processed", "samples_cleaned.csv")

# ============================================================
# 1) Load baseline prepared df
# ============================================================

# Load the raw samples through the project loader and run the unit rules on
# the result in one step.
# NOTE(review): presumably apply_unit_rules is what adds the "unit_ok" column
# used further down — confirm in labpulse.qc_rules.
df_raw = apply_unit_rules(prepare_base_df(str(RAW_PATH)))

print(f"raw rows/cols: {df_raw.shape}")
# Preview only: the value is discarded unless this is the last expression
# evaluated in the notebook cell.
df_raw.head()

# ============================================================
# 2) Drop invalid numeric measurements
# ============================================================

# Keep only the rows accepted by the project's numeric filter.
df_num = filter_numeric_rows(df_raw)

print(f"numeric rows/cols: {df_num.shape}")
# Should report 0 if the filter removed every row with a missing value_num.
print(f"NaN value_num after filter: {df_num['value_num'].isna().sum()}")

# Number of rows dropped by the numeric filter (row-count difference).
removed_numeric = len(df_raw) - len(df_num)
print(f"removed (non-numeric): {removed_numeric}")

# ============================================================
# 3) Drop unit mismatches
# ============================================================

# Keep the rows whose unit_ok flag is truthy and renumber them from 0.
# reset_index(drop=True) returns a new frame, so df_num itself is left as is.
df_clean = df_num.loc[df_num["unit_ok"]].reset_index(drop=True)

print(f"clean rows/cols: {df_clean.shape}")
# Count of rows still flagged False after the filter; expected to be 0.
print(f"unit_ok False in clean: {(~df_clean['unit_ok']).sum()}")

# Number of rows dropped by the unit filter (row-count difference).
removed_units = len(df_num) - len(df_clean)
print(f"removed (unit mismatch): {removed_units}")

# Preview only: the value is discarded unless this is the last expression
# evaluated in the notebook cell.
df_clean.head()

# ============================================================
# 4) Final sanity checks (run BEFORE anything is written)
# ============================================================
# Validating first means a failed check stops the cell before this run
# writes an invalid samples_cleaned.csv to disk.

# No missing numeric values may remain.
assert df_clean["value_num"].isna().sum() == 0, "value_num still contains NaN"
# No row flagged as a unit mismatch may remain.
assert (~df_clean["unit_ok"]).sum() == 0, "unit_ok still contains False"
# Every row must carry a non-missing date_dt.
assert df_clean["date_dt"].isna().sum() == 0, "date_dt contains invalid dates"

# ============================================================
# 5) Save cleaned dataset
# ============================================================

# Create data/processed/ (and any missing parents) if it does not exist yet.
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
# index=False: the positional index is not written as a CSV column.
df_clean.to_csv(OUT_PATH, index=False)
print("Saved:", OUT_PATH)

# Printed after "Saved:" so the success-path output keeps its original order,
# even though the checks themselves already ran above.
print("Sanity checks passed ✅")



raw rows/cols: (120, 11)
numeric rows/cols: (114, 11)
NaN value_num after filter: 0
removed (non-numeric): 6
clean rows/cols: (109, 11)
unit_ok False in clean: 0
removed (unit mismatch): 5
Saved: ..\data\processed\samples_cleaned.csv
Sanity checks passed ✅
