In [14]:
import pandas as pd
import numpy as np

# Load the raw sample readings.
df = pd.read_csv('../data/raw/samples.csv')

# Turn the known non-numeric placeholders into NaN, then parse what is left;
# anything else that cannot be parsed is coerced to NaN as well.
df['value_clean'] = df['value'].replace(
    to_replace=['error', 'bad_reading', 'N/A'],
    value=np.nan,
)
df['value_num'] = pd.to_numeric(df['value_clean'], errors='coerce')

# Dates that do not match YYYY-MM-DD become NaT instead of raising.
df['date_dt'] = pd.to_datetime(df['date'], errors="coerce", format="%Y-%m-%d")

# Unit each parameter is expected to be reported in.
unit_rules = {
    'Water': 'mg/kg',
    'Sulfur': 'mg/kg',
    'Chloride': 'mg/kg',
    'Ash': '% m/m',
    'Viscosity': 'cSt',
}

# Parameters missing from unit_rules map to NaN, so their rows can never
# satisfy the unit check below.
df['expected_unit'] = df['parameter'].map(unit_rules)
df['unit_ok'] = df['unit'].eq(df['expected_unit'])

# Keep only rows that have both the expected unit and a usable numeric value.
df_analysis = df.loc[df['unit_ok'] & df['value_num'].notna()].copy()

# Row/column counts before and after filtering.
print(df.shape, df_analysis.shape, sep='\n')

def detect_iqr_outliers(group, k=1.5, value_col='value_num'):
    """Return the rows of *group* that fall outside its IQR fences.

    The fences are ``q1 - k * iqr`` and ``q3 + k * iqr``, where ``q1`` and
    ``q3`` are the 25th and 75th percentiles of ``group[value_col]``.

    The function is written to be used with ``DataFrameGroupBy.apply``; it
    reads the group key from ``group.name`` and stores it in a ``parameter``
    column of the result.  When *group* carries no ``name`` attribute (for
    example when the function is called directly on a plain DataFrame) the
    ``parameter`` column is left untouched instead of raising
    ``AttributeError``.

    Args:
        group: DataFrame with the rows of one group; must contain
            *value_col*.
        k: Fence multiplier.  The default of 1.5 reproduces the previously
            hard-coded behaviour.
        value_col: Name of the numeric column to test.

    Returns:
        A copy of the outlying rows with the extra columns ``iqr_lower``,
        ``iqr_upper``, ``parameter`` (when a group key is available) and
        ``iqr``.  The result is empty when no row lies outside the fences.
    """
    values = group[value_col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower = q1 - k * iqr
    upper = q3 + k * iqr

    # Strict comparisons: values sitting exactly on a fence are not outliers.
    out = group[(values < lower) | (values > upper)].copy()
    out['iqr_lower'] = lower
    out['iqr_upper'] = upper

    # The group key is (re)attached here so the result always identifies its
    # group, even if the grouping column is not part of the frame passed in.
    group_key = getattr(group, 'name', None)
    if group_key is not None:
        out['parameter'] = group_key
    out['iqr'] = iqr

    return out

# Run the IQR detector once per parameter and flatten the result into a
# single frame with a fresh 0..n-1 index.
outliers = (
    df_analysis
    .groupby('parameter', group_keys=False)
    .apply(detect_iqr_outliers)
    .reset_index(drop=True)
)

print(outliers.shape)
# Bare expression: the preview is built but its value is discarded here.
outliers[['sample_id', 'product', 'parameter', 'value_num', 'unit', 'date_dt']].head(10)

# Number of outliers found per parameter.
print(outliers['parameter'].value_counts())

# Per-parameter summary: how many outliers and the range they span.
outliers_summary = (
    outliers
    .groupby('parameter')
    .agg(
        outlier_count=pd.NamedAgg(column='sample_id', aggfunc='count'),
        min_outlier=pd.NamedAgg(column='value_num', aggfunc='min'),
        max_outlier=pd.NamedAgg(column='value_num', aggfunc='max'),
    )
    .sort_values(by='outlier_count', ascending=False)
)

# Severity = distance beyond the violated fence, expressed in IQR units.
is_high_side = outliers['value_num'] > outliers['iqr_upper']
distance_above = (outliers['value_num'] - outliers['iqr_upper']) / outliers['iqr']
distance_below = (outliers['iqr_lower'] - outliers['value_num']) / outliers['iqr']
outliers['severity'] = np.where(is_high_side, distance_above, distance_below)

# Bucket the severity: up to 1 IQR -> low, up to 3 IQR -> medium, above -> high.
severity_bins = [-np.inf, 1, 3, np.inf]
severity_labels = ['low', 'medium', 'high']
outliers['severity_level'] = pd.cut(
    outliers['severity'],
    bins=severity_bins,
    labels=severity_labels,
)

# Full outlier table, including every intermediate column.
outliers.to_csv('../data/processed/outliers_iqr.csv', index=False)

# Trimmed alert table, sorted in descending order of severity level and then
# of severity.
alert_columns = [
    'sample_id', 'product', 'parameter', 'value_num', 'unit', 'date_dt',
    'iqr_lower', 'iqr_upper', 'severity', 'severity_level',
]
alerts_outliers = outliers[alert_columns].sort_values(
    by=['severity_level', 'severity'],
    ascending=False,
)
alerts_outliers.to_csv('../data/processed/alerts_outliers_iqr.csv', index=False)

# Bare expression: the preview is built but its value is discarded here.
alerts_outliers.head(10)

print(outliers_summary)

# Sanity check on the final frame's layout.
print("COLUMNS:", outliers.columns)
print("INDEX NAMES:", outliers.index.names)




(120, 11)
(109, 11)
(4, 14)
parameter
Water        2
Ash          1
Viscosity    1
Name: count, dtype: int64
           outlier_count  min_outlier  max_outlier
parameter                                         
Water                  2       825.00       825.00
Ash                    1         0.36         0.36
Viscosity              1       810.00       810.00
COLUMNS: Index(['sample_id', 'product', 'value', 'unit', 'date', 'value_clean',
       'value_num', 'date_dt', 'expected_unit', 'unit_ok', 'iqr_lower',
       'iqr_upper', 'parameter', 'iqr', 'severity', 'severity_level'],
      dtype='str')
INDEX NAMES: [None]
