# Data Quality Checks

**Purpose:** Verify data correctness before analysis

**Tasks:**
- Check for missing values
- Find impossible values (e.g., age = -10, heart rate = 900)
- Identify duplicates

In [2]:
import pandas as pd
import numpy as np
import sqlite3
# import matplotlib.pyplot as plt
# import seaborn as sns

# Show every column when a DataFrame is rendered (None removes the column cap)
pd.options.display.max_columns = None
# sns.set_style('whitegrid')

## Load Database

**Important:** Run these checks on the full raw dataset, i.e. load the data BEFORE the train/test split performed in Transformation.py

In [3]:
# Connect to database and load full dataset.
# The connection is closed in a `finally` block so the database handle is
# released even when the query raises (for example, if the table is missing).
conn = sqlite3.connect('../databases/nhanes_1st.db')
try:
    df = pd.read_sql_query('SELECT * FROM raw_dataset', conn)
finally:
    conn.close()
print(f"Dataset shape: {df.shape}")

Dataset shape: (56893, 29)


## 1. Missing Values Analysis

In [4]:
# Count nulls per column and express each count as a percentage of all rows
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)

# Keep only the columns that actually have gaps, highest percentage first
missing_df = (
    pd.concat([missing, missing_pct], axis=1, keys=['Missing', 'Percent'])
    .loc[lambda frame: frame['Missing'] > 0]
    .sort_values('Percent', ascending=False)
)

print('Missing Values Summary:\n')
print(missing_df.to_string())

Missing Values Summary:

                            Missing    Percent
smoking_status                42401  74.527622
high_glucose_mg_dl            40538  71.253054
high_triglycerides_mg_dl      40023  70.347846
alcohol_drinks_per_week       31919  56.103563
has_cardiovascular_disease    22796  40.068198
liver_ast_U_L                 22461  39.479374
potassium_mmol_L              22438  39.438947
liver_alt_U_L                 22390  39.354578
bilirubin_mg_dl               22387  39.349305
uric_acid_mg_dl               22385  39.345789
liver_ggt_U_L                 22383  39.342274
creatinine_mg_dl              22381  39.338759
sodium_mmol_L                 22337  39.261421
heart_rate_bpm                19715  34.652769
high_blood_pressure           17989  31.619004
cholesterol_mg_dl             17557  30.859684
low_hdl_mg_dl                 17557  30.859684
platelets_count               12955  22.770815
white_blood_cells_count       12955  22.770815
mean_corpuscular_volume_fL    12955  22.770815

In [5]:
# Single out the columns where more than half of the rows are missing
high_missing = missing_df.loc[missing_df['Percent'].gt(50)]
print(f'\nColumns with >50% missing: {len(high_missing)}')
if not high_missing.empty:
    print(high_missing.to_string())


Columns with >50% missing: 4
                          Missing    Percent
smoking_status              42401  74.527622
high_glucose_mg_dl          40538  71.253054
high_triglycerides_mg_dl    40023  70.347846
alcohol_drinks_per_week     31919  56.103563


## 2. Impossible Values Detection

In [6]:
# Define valid ranges for clinical measurements as (min, max) pairs.
# Both bounds are inclusive: a value is flagged only when it is strictly
# below min or strictly above max.
# NOTE(review): the thresholds are analyst-chosen; confirm them against
# clinical references before using the counts to drop rows.
rules = {
    'age': (0, 115),
    'body_mass_index': (10, 100),
    'height_cm': (50, 250),
    'heart_rate_bpm': (30, 200),
    'white_blood_cells_count': (0, 50),
    'platelets_count': (20, 1000),
    'hemoglobin_g_dl': (5, 20),
    'creatinine_mg_dl': (0.3, 15),
    'liver_ast_U_L': (5, 500),
    'bilirubin_mg_dl': (0.1, 20),
    'liver_ggt_U_L': (5, 500),
    'uric_acid_mg_dl': (1, 15),
    'sodium_mmol_L': (120, 160),
    'potassium_mmol_L': (2.5, 6.0),
    'cholesterol_mg_dl': (50, 500),
}

# A rule whose column is absent from the loaded table cannot be evaluated.
# Report such columns explicitly instead of letting a KeyError abort the
# whole cell and hide the results for every other column.
missing_rule_cols = [col for col in rules if col not in df.columns]
if missing_rule_cols:
    print(f'Skipping rule columns not present in dataset: {missing_rule_cols}\n')

# Check for impossible values
results = []
for col, (min_val, max_val) in rules.items():
    if col not in df.columns:
        continue
    values = df[col]
    # NaN compares False on both sides, so missing values are never flagged.
    out_of_range = (values < min_val) | (values > max_val)
    n_impossible = int(out_of_range.sum())
    if n_impossible > 0:
        results.append({
            'Column': col,
            'Impossible': n_impossible,
            # Number of non-null observations the check was applied to
            'Total': values.notna().sum(),
        })

if results:
    impossible_df = pd.DataFrame(results)
    print('Impossible values found:\n')
    print(impossible_df.to_string(index=False))
else:
    print('No impossible values found!')

Impossible values found:

                 Column  Impossible  Total
white_blood_cells_count           6  43938
        platelets_count           5  43938
       creatinine_mg_dl          13  34512
          liver_ast_U_L           3  34432
        bilirubin_mg_dl          63  34506
          liver_ggt_U_L          73  34510
        uric_acid_mg_dl           6  34508
          sodium_mmol_L           3  34556
       potassium_mmol_L           6  34455
      cholesterol_mg_dl           5  39336


## 3. Duplicate Records

In [7]:
# Check for duplicate rows
duplicates = df[df.duplicated(keep=False)]
print(f'Total duplicate rows: {len(duplicates)} out of {len(df)}')

Total duplicate rows: 20489 out of 56893


## Summary

In [9]:
# Recap of the three checks, framed by a 60-character rule
banner = '=' * 60
summary_lines = [
    banner,
    'SUMMARY',
    banner,
    f'Total records: {len(df):,}',
    f'Columns with missing values: {len(missing_df)}',
    # `results` is a list, so its length is already 0 when nothing was flagged
    f'Columns with impossible values: {len(results)}',
    f'Duplicate rows: {len(duplicates)}',
]
print(*summary_lines, sep='\n')

SUMMARY
Total records: 56,893
Columns with missing values: 26
Columns with impossible values: 10
Duplicate rows: 20489
