In [15]:
from itertools import groupby

import pandas as pd
import numpy as np

#01 – Load & Data Quality Audit

# Notebook Purpose:
# - Load raw data
# - Audit data quality (missing data, types, units)
# - Identify QC issues before analysis

# Load the raw sample measurements. The path is relative, so it is resolved
# against the current working directory.
raw_samples_path = '../data/raw/samples.csv'
df = pd.read_csv(raw_samples_path)

# Data was loaded from `data/raw/samples.csv`. The dataset contains 120 records and 6 columns.

# Blank out the known non-numeric placeholders, then swap ',' for '.'
# (presumably decimal commas -- TODO confirm there are no thousands separators).
df['value_clean'] = (
    df['value']
    .replace(['error', 'bad_reading', 'N/A'], np.nan)
    .str.replace(',', '.', regex=False)
)

# Anything that still cannot be parsed is coerced to NaN instead of raising.
df['value_num'] = pd.to_numeric(df['value_clean'], errors='coerce')

# Text entries (`error`, `bad_reading`, `N/A`) were detected in the `value` column
# and replaced with NaN. In total, 6 missing numeric values were detected.

# Descriptive statistics of the numeric values for every (product, parameter)
# pair. No unit filtering is applied at this point.
summary_stats = ['count', 'mean', 'std', 'min', 'max', 'median']
group_by_product = (
    df.groupby(["product", "parameter"])["value_num"]
    .agg(summary_stats)
)

# Parse dates written as YYYY-MM-DD; values that do not match become NaT.
df['date_dt'] = pd.to_datetime(
    df['date'],
    format="%Y-%m-%d",
    errors="coerce",
)

# NOTE: despite its name, this holds the number of rows whose date is NaT
# after parsing (0 means every date was parsed).
all_correct_data = df['date_dt'].isnull().sum()

# The `date` column was successfully converted to datetime type. No invalid date formats were detected.

# print(all_correct_data)

# Rows without a usable numeric value, keeping the raw `value` for inspection.
nan_columns = ['sample_id', 'product', 'parameter', 'value', 'unit', 'date_dt']
nan_view = df.loc[df['value_num'].isna(), nan_columns]

# Expected reporting unit for each known parameter.
unit_rules = {
    'Water': 'mg/kg',
    'Sulfur': 'mg/kg',
    'Chloride': 'mg/kg',
    'Ash': '% m/m',
    'Viscosity': 'cSt',
}

# Parameters that are not listed in unit_rules map to NaN here.
df['expected_unit'] = df['parameter'].map(unit_rules)

# Exact string comparison; a NaN on either side compares as False, so rows
# with an unmapped parameter or a missing unit are marked as not OK.
df['unit_ok'] = df['unit'].eq(df['expected_unit'])

# Boolean mask of rows whose reported unit differs from the expected one.
unit_mismatch_mask = ~df['unit_ok']

# Compact view of the mismatching rows: parameter plus the two units.
false_rows = df.loc[unit_mismatch_mask, ['parameter', 'unit', 'expected_unit']]
# print(false_rows)

# Detailed view of the same mismatching rows. The previous version selected
# these columns from every row of df, so it was not limited to unit issues.
unit_issues = df.loc[unit_mismatch_mask, ["sample_id", "product", "parameter", "value",
                                          "value_num", "unit", "expected_unit", "date_dt"]]

# print(unit_issues)
# print(unit_issues.shape)
# print(unit_issues.head())

# Select the rows with a unit mismatch once and reuse them for both breakdowns.
unit_errors = df[~df['unit_ok']]

# Number of mismatching rows per parameter, most frequent first.
false_by_parameter = (
    unit_errors.groupby('parameter').size()
    .reset_index(name='error_count')
    .sort_values('error_count', ascending=False)
)

# Number of mismatching rows per reported unit, sorted the same way so the
# most common incorrect unit is on the first row.
false_by_unit = (
    unit_errors.groupby('unit').size()
    .reset_index(name='error_count')
    .sort_values('error_count', ascending=False)
)
print(false_by_parameter)
print(false_by_unit)

## Conclusions – Units

# - 5 records were detected with incorrect units.
# - The parameter with the most unit errors is **Water** (3 records).
# - The most common incorrect unit is **mg/L** (4 records), reported instead of **mg/kg**.
# - This points to a systematic unit problem for this parameter.

## Data Audit Summary

# The data contains:
# - text entries in the value column,
# - missing measurements,
# - inconsistent units for selected parameters.
#
# Before further analysis, it is necessary to:
# - filter out incorrect units,
# - work only with numerical values,
# - detect outliers.


  parameter  error_count
2     Water            3
0       Ash            1
1  Chloride            1
   unit  error_count
0  mg/L            4
1   ppm            1
