In [2]:
import pandas as pd
import numpy as np

# ============================================================
# 01 — Load & Data Quality Audit (LabPulse)
#
# Purpose:
# - Load raw data from data/raw/samples.csv
# - Audit data quality (types, missing values, invalid readings)
# - Validate measurement units (QC rules)
# - Summarize data quality issues before analysis
#
# IMPORTANT:
# Raw data is NEVER modified on disk.
# All transformations are performed in-memory only.
# ============================================================


# ============================================================
# 1) Load raw data
# ============================================================

# Read the raw CSV into memory; the file on disk is only read, never written.
df = pd.read_csv(filepath_or_buffer='../data/raw/samples.csv')

# Report the dataset dimensions as (rows, columns)
print(df.shape)


# ============================================================
# 2) Raw data overview
# ============================================================

# List the column labels to show how the dataset is structured
print(df.columns)

# Each row represents a single laboratory measurement
# for a given product and parameter.


# ============================================================
# 3) Measurement value audit (value -> value_num)
# ============================================================

# The 'value' column may contain invalid or non-numeric readings
# such as: 'error', 'bad_reading', 'N/A'.
# These indicate failed or missing measurements.

# Replace known invalid tokens with NaN so they are treated as missing
df['value_clean'] = df['value'].replace(
    ['error', 'bad_reading', 'N/A'],
    np.nan
)

# Normalize decimal separator (future-proof for real-world data).
# The cast to str keeps `.str` usable even if the column was read as numeric,
# but on some pandas versions it renders missing values as the literal text
# 'nan'. Masking with the pre-cast notna() keeps missing values truly missing
# in 'value_clean'. The right-hand side is evaluated before the assignment,
# so the mask is taken from the column as it was before the cast.
df['value_clean'] = (
    df['value_clean']
    .astype(str)
    .str.replace(',', '.', regex=False)
    .where(df['value_clean'].notna())
)

# Convert cleaned values to numeric
# Any remaining invalid values will become NaN (safe conversion)
df['value_num'] = pd.to_numeric(df['value_clean'], errors='coerce')

# Count missing numeric measurements.
# Printed explicitly: a bare expression in the middle of a notebook cell is
# evaluated but never displayed (only the cell's last expression is shown).
print('Missing numeric measurements:', df['value_num'].isna().sum())


# ============================================================
# 4) Inspect rows with missing numeric values
# ============================================================

# Select, in one .loc step, the rows whose numeric value is missing and keep
# only the original (raw) columns so the failed readings can be reviewed.
nan_view = df.loc[
    df['value_num'].isna(),
    ['sample_id', 'product', 'parameter', 'value', 'unit', 'date'],
]

print(nan_view)


# ============================================================
# 5) Date column audit
# ============================================================

# Convert date column to datetime type.
# A strict ISO format is enforced; anything that does not match '%Y-%m-%d'
# becomes NaT instead of raising.
df['date_dt'] = pd.to_datetime(
    df['date'],
    format='%Y-%m-%d',
    errors='coerce'
)

# Count invalid dates (NaT). This includes both unparseable strings and
# dates that were already missing in the raw column.
# Printed explicitly: a bare expression in the middle of a notebook cell is
# evaluated but never displayed, so the count would otherwise be lost.
print('Invalid dates (NaT):', df['date_dt'].isna().sum())

# Result:
# - All dates were successfully parsed
# - No invalid date formats detected


# ============================================================
# 6) Unit validation (QC rules)
# ============================================================

# QC rule table: parameter name -> the only unit accepted for it
unit_rules = {
    'Water': 'mg/kg',
    'Sulfur': 'mg/kg',
    'Chloride': 'mg/kg',
    'Ash': '% m/m',
    'Viscosity': 'cSt',
}

# Attach the expected unit to every record by looking up its parameter.
# Parameters that are absent from the rule table get NaN here.
df['expected_unit'] = df['parameter'].map(unit_rules)

# Flag each record: unit_ok is True only when the reported unit equals the
# expected one. A NaN on either side compares unequal, so it yields False.
df['unit_ok'] = df['unit'].eq(df['expected_unit'])


# ============================================================
# 7) Unit mismatch analysis
# ============================================================

# Rows that failed the unit check. The filter is computed once and reused,
# and the boolean mask is negated with `~` rather than compared to False.
unit_mismatches = df[~df['unit_ok']]

# Count unit mismatches per parameter, most affected parameter first
false_by_parameter = (
    unit_mismatches
    .groupby('parameter')
    .size()
    .reset_index(name='error_count')
    .sort_values('error_count', ascending=False)
)

# Count unit mismatches per reported (incorrect) unit
false_by_unit = (
    unit_mismatches
    .groupby('unit')
    .size()
    .reset_index(name='error_count')
)

# Last expression of the cell: the notebook displays both tables as a tuple
false_by_parameter, false_by_unit


# ============================================================
# 8) Conclusions — Units
# ============================================================

# Findings:
# - 5 records were detected with incorrect measurement units.
# - The most frequently affected parameter is Water.
# - The most common incorrect unit is mg/L instead of mg/kg.
# - This indicates a systemic issue in unit reporting rather than random errors.


# ============================================================
# 9) Data Audit Summary
# ============================================================

# The dataset contains the following quality issues:
# - non-numeric entries in the measurement column,
# - missing measurement values,
# - inconsistent units for selected parameters.
#
# Before further analysis, it is necessary to:
# - exclude records with incorrect units,
# - work only with validated numeric values,
# - detect and analyze outliers.
#
# This concludes the initial data quality audit.



(120, 6)
Index(['sample_id', 'product', 'parameter', 'value', 'unit', 'date'], dtype='str')
     sample_id   product  parameter        value   unit        date
2           86     JetA1        Ash  bad_reading  % m/m  2026-01-26
38          60      HSFO  Viscosity          NaN    cSt  2026-01-01
44         110  Gasoline        Ash          NaN  % m/m  2026-01-20
58          79      HSFO      Water          NaN  mg/kg  2026-01-23
83          78    Diesel      Water          NaN  mg/kg  2026-01-27
119         39  Gasoline  Viscosity        error    cSt  2026-01-21


(  parameter  error_count
 2     Water            3
 0       Ash            1
 1  Chloride            1,
    unit  error_count
 0  mg/L            4
 1   ppm            1)