In [35]:
import pandas as pd
import numpy as np

df = pd.read_csv('../data/raw/samples.csv')

df['value_clean'] = df['value'].replace(['error', 'bad_reading', 'N/A'], np.nan)

df['value_clean'] = df['value_clean'].str.replace(',', '.', regex=False)

df['value_num'] = pd.to_numeric(df['value_clean'], errors='coerce')

group_by_product = df.groupby(["product", "parameter"])["value_num"].agg([
    'count', 'mean', 'std', 'min', 'max', 'median'])

df['date_dt'] = pd.to_datetime(df['date'], format="%Y-%m-%d", errors="coerce")

nan_view = df[df['value_num'].isna()][['sample_id', 'product', 'parameter', 'value', 'unit', 'date_dt']]

unit_rules = {
    'Water': 'mg/kg',
    'Sulfur': 'mg/kg',
    'Chloride': 'mg/kg',
    'Ash': '% m/m',
    'Viscosity': 'cSt'
}

df['expected_unit'] = df['parameter'].map(unit_rules)

df['unit_ok'] = df['unit'] == df['expected_unit']

false_rows = df.loc[df['unit_ok'] == False, ['parameter', 'unit', 'expected_unit']]
#print(false_rows)

unit_issues = df[["sample_id", "product", "parameter", "value",
                       "value_num", "unit", "expected_unit", "date_dt"]]

print(unit_issues)



     sample_id   product  parameter        value  value_num   unit  \
0           44     JetA1  Viscosity        137.2   137.2000    cSt   
1          101      HSFO        Ash       0.0136     0.0136  % m/m   
2           86     JetA1        Ash  bad_reading        NaN  % m/m   
3           28      HSFO  Viscosity        239.1   239.1000    cSt   
4           37     JetA1   Chloride        14.31    14.3100  mg/kg   
..         ...       ...        ...          ...        ...    ...   
115         75     JetA1     Sulfur       280.93   280.9300  mg/kg   
116          4      HSFO        Ash       0.0224     0.0224  % m/m   
117          3     JetA1   Chloride         4.68     4.6800  mg/kg   
118        119    Diesel      Water       139.99   139.9900  mg/kg   
119         39  Gasoline  Viscosity        error        NaN    cSt   

    expected_unit    date_dt  
0             cSt 2026-01-02  
1           % m/m 2026-01-07  
2           % m/m 2026-01-26  
3             cSt 2026-01-26  
4   