In [3]:
from labpulse.cleaning import prepare_base_df

# ============================================================
# 01 — Load & Data Quality Audit (LabPulse)
#
# Purpose:
# - Load raw data from data/raw/samples.csv
# - Audit data quality (missing values, invalid readings)
# - Validate measurement units (QC rules)
# - Summarize data quality issues before analysis
#
# IMPORTANT:
# Raw data is NEVER modified on disk.
# All transformations are performed in-memory only.
# ============================================================


# ============================================================
# 1) Load raw data (baseline prepared df)
# ============================================================

# Location of the raw samples file, relative to the notebook's working directory.
RAW_SAMPLES_CSV = "../data/raw/samples.csv"

# prepare_base_df is a project helper (labpulse.cleaning). The sections below
# rely on the returned frame exposing "value_num" and "date_dt" next to the raw
# columns — presumably those are the parsed numeric/date versions; confirm in
# the helper.
df = prepare_base_df(RAW_SAMPLES_CSV)


# ============================================================
# 2) Quick audit — missing numeric measurements + date parsing
# ============================================================

# Boolean mask of rows whose numeric measurement is missing; computed once and
# reused for both the count and the inspection view.
missing_value_mask = df["value_num"].isna()

nan_count = missing_value_mask.sum()
nat_count = df["date_dt"].isna().sum()

# Raw (unparsed) columns shown for the rows with a missing numeric value, so
# the original entries can be inspected by eye.
audit_columns = ["sample_id", "product", "parameter", "value", "unit", "date"]
nan_view = df.loc[missing_value_mask, audit_columns]

print("value_num NaN:", nan_count)
print("date_dt NaT:", nat_count)
print("nan_view rows:", len(nan_view))


# ============================================================
# 3) Unit validation (QC rules)
# ============================================================

# QC rule table: parameter name -> the only unit accepted for that parameter.
unit_rules = {
    **dict.fromkeys(("Water", "Sulfur", "Chloride"), "mg/kg"),
    "Ash": "% m/m",
    "Viscosity": "cSt",
}

# Look up the expected unit for every row (in-memory column, raw file untouched).
# NOTE(review): parameters that are not listed in unit_rules map to NaN, and a
# NaN never compares equal, so such rows end up with unit_ok == False.
df["expected_unit"] = df["parameter"].map(unit_rules)

# Exact string comparison: True only when the reported unit matches the rule.
df["unit_ok"] = df["unit"].eq(df["expected_unit"])


# ============================================================
# 4) Unit mismatch analysis
# ============================================================

# Rows whose reported unit does not match the QC rule for their parameter.
# `~` negates the boolean mask (idiomatic replacement for `== False`).
unit_mismatches = df.loc[~df["unit_ok"]]

# Mismatch counts per parameter, most affected parameter first.
# dropna=False keeps rows with a missing parameter as their own group, so the
# counts always add up to len(unit_mismatches).
false_by_parameter = (
    unit_mismatches.groupby("parameter", dropna=False)
    .size()
    .reset_index(name="error_count")
    .sort_values("error_count", ascending=False)
)

# Mismatch counts per reported unit, most common incorrect unit first.
# dropna=False matters here: a missing unit is itself a mismatch and would
# otherwise silently disappear from this summary.
false_by_unit = (
    unit_mismatches.groupby("unit", dropna=False)
    .size()
    .reset_index(name="error_count")
    .sort_values("error_count", ascending=False)
)

# Bare expression so the notebook displays both summary tables as cell output.
false_by_parameter, false_by_unit


# ============================================================
# 5) Conclusions — Units
# ============================================================

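# Findings (TODO: fill in the placeholders from false_by_parameter / false_by_unit):
# - X records were detected with incorrect measurement units.
# - The most frequently affected parameter is ...
# - The most common incorrect unit is ...
# - If the errors concentrate in a few parameters or units, this indicates a
#   systemic issue in unit reporting rather than random errors.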


# ============================================================
# 6) Data Audit Summary
# ============================================================

# The dataset contains the following quality issues:
# - non-numeric entries in the measurement column,
# - missing measurement values,
# - inconsistent units for selected parameters.
#
# Before further analysis, it is necessary to:
# - exclude records with incorrect units,
# - work only with validated numeric values,
# - detect and analyze outliers.
#
# This concludes the initial data quality audit.




value_num NaN: 6
date_dt NaT: 0
nan_view rows: 6
