In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [26]:
df = pd.read_csv('../Data/cleaned_data.csv')
df.head()

Unnamed: 0,Sex,Age,Weight (kg),Height (m),BMI,Abdominal Circumference (cm),Total Cholesterol (mg/dL),HDL (mg/dL),Fasting Blood Sugar (mg/dL),Smoking Status,...,Physical Activity Level,Family History of CVD,CVD Risk Level,Height (cm),Waist-to-Height Ratio,Systolic BP,Diastolic BP,Blood Pressure Category,Estimated LDL (mg/dL),CVD Risk Score
0,F,32.0,69.1,1.71,23.6,86.2,248.0,78.0,111.0,N,...,Low,N,INTERMEDIARY,171.0,0.504,125.0,79.0,Elevated,140.0,17.93
1,F,55.0,118.7,1.69,41.6,82.5,162.0,50.0,135.0,Y,...,High,Y,HIGH,169.0,0.488,139.0,70.0,Hypertension Stage 1,82.0,20.51
2,M,46.0,86.6145,1.83,25.9,106.7,103.0,73.0,114.0,N,...,High,Y,INTERMEDIARY,183.0,0.583,104.0,77.0,Normal,0.0,12.64
3,M,44.0,108.3,1.8,33.4,96.6,134.0,46.0,91.0,N,...,High,Y,INTERMEDIARY,175.694,0.537,140.0,83.0,Hypertension Stage 1,58.0,16.36
4,F,32.0,99.5,1.86,28.8,102.7,146.0,64.0,141.0,Y,...,High,N,INTERMEDIARY,186.0,0.552,144.0,83.0,Hypertension Stage 1,52.0,17.88


# Clinical Sanity Checks & Edge Cases to Handle 

## Blood Pressure Consistency

1. Systolic BP > Diastolic BP

> Always true medically

In [27]:
df[df['Systolic BP'] > df['Diastolic BP']].shape

(1463, 21)

In [28]:
df[df['Systolic BP'] < df['Diastolic BP']].shape

(60, 21)

Let's not consider records where Systolic BP is less than Diastolic BP

In [29]:
df = df[df['Systolic BP'] > df['Diastolic BP']]

2. Realistic BP Ranges

| Measurement   |   Min   |   Max   |   
|---------------|---------|---------|
| Systolic      |    80   |  250    |
| Diastolic     |    40   |  150    |

In [30]:
df[
    (df['Systolic BP'].between(80, 250)) &
    (df['Diastolic BP'].between(40, 150))
]


Unnamed: 0,Sex,Age,Weight (kg),Height (m),BMI,Abdominal Circumference (cm),Total Cholesterol (mg/dL),HDL (mg/dL),Fasting Blood Sugar (mg/dL),Smoking Status,...,Physical Activity Level,Family History of CVD,CVD Risk Level,Height (cm),Waist-to-Height Ratio,Systolic BP,Diastolic BP,Blood Pressure Category,Estimated LDL (mg/dL),CVD Risk Score
0,F,32.0,69.1000,1.710,23.6,86.200000,248.0,78.0,111.0,N,...,Low,N,INTERMEDIARY,171.000,0.504,125.0,79.0,Elevated,140.0,17.930
1,F,55.0,118.7000,1.690,41.6,82.500000,162.0,50.0,135.0,Y,...,High,Y,HIGH,169.000,0.488,139.0,70.0,Hypertension Stage 1,82.0,20.510
2,M,46.0,86.6145,1.830,25.9,106.700000,103.0,73.0,114.0,N,...,High,Y,INTERMEDIARY,183.000,0.583,104.0,77.0,Normal,0.0,12.640
3,M,44.0,108.3000,1.800,33.4,96.600000,134.0,46.0,91.0,N,...,High,Y,INTERMEDIARY,175.694,0.537,140.0,83.0,Hypertension Stage 1,58.0,16.360
4,F,32.0,99.5000,1.860,28.8,102.700000,146.0,64.0,141.0,Y,...,High,N,INTERMEDIARY,186.000,0.552,144.0,83.0,Hypertension Stage 1,52.0,17.880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1523,M,63.0,91.6250,1.702,31.6,105.592094,189.0,76.0,153.0,N,...,Moderate,N,INTERMEDIARY,175.694,0.601,162.0,60.0,Hypertension Stage 2,83.0,20.936
1524,F,40.0,72.0700,1.889,20.2,95.326000,157.0,60.0,93.0,N,...,Moderate,Y,LOW,188.894,0.505,119.0,66.0,Normal,67.0,14.300
1526,M,39.0,98.6260,1.521,42.6,77.193000,237.0,82.0,147.0,N,...,High,N,INTERMEDIARY,152.119,0.507,150.0,90.0,Hypertension Stage 2,125.0,18.251
1527,M,71.0,116.1630,1.841,34.3,114.197000,193.0,84.0,123.0,N,...,High,Y,INTERMEDIARY,184.059,0.620,112.0,63.0,Normal,79.0,15.316


3. BP Category Must Match Readings

Mapping:

- Normal: <120 / <80

- Elevated: 120–129 / <80

- Hypertension Stage 1: 130–139 / 80–89

- Hypertension Stage 2: ≥140 / ≥90

In [32]:
def validate_bp_category(row):
    sbp = row['Systolic BP']
    dbp = row['Diastolic BP']

    if sbp < 120 and dbp < 80:
        return 'Normal'
    elif 120 <= sbp < 130 and dbp < 80:
        return 'Elevated'
    elif (130 <= sbp < 140) or (80 <= dbp < 90):
        return 'Hypertension Stage 1'
    elif (140 <= sbp) or (90 <= dbp):
        return 'Hypertension Stage 2'
    else:
        return 'Hypertensive Crisis'

In [33]:
df.apply(validate_bp_category, axis=1).value_counts()

Hypertension Stage 2    566
Hypertension Stage 1    497
Normal                  300
Elevated                100
Name: count, dtype: int64

> No mismatch found

## Height, Weight & BMI Consistency

4. Height(cm) <-> Height(m)

In [37]:
df[df['Height (cm)'] == df['Height (m)']*100].shape

(895, 21)

In [49]:
df[
    abs(df['Height (cm)'] - df['Height (m)']*100) < 26
].shape

(1463, 21)

5. BMI Formula Must Hold

BMI = Weight/height<sup>2

In [52]:
df['BMI_calculated'] = df['Weight (kg)'] / (df['Height (m)']**2)
df[df['BMI'] == df['BMI_calculated'].round(1)].shape

(1463, 22)

6. Vaild Ranges

| Feature       |   Range | 
|---------------|---------|
| Height (m)      |1.2-2.2   | 
| Weight (kg)     | 30-200  | 
| BMI           |   10-60 |

In [53]:
df[
    df['Height (m)'].between(1.2, 2.2) &
    df['Weight (kg)'].between(30, 200) &
    df['BMI'].between(10, 60)
].shape

(1463, 22)

> Nothing abnormal

## Cholesterol & Lipid Profile Rules

7. HDL < Total Cholesterol

In [55]:
df[df['HDL (mg/dL)'] < df['Total Cholesterol (mg/dL)']].shape == df.shape

True

8. LDL < Total Cholesterol

In [65]:
df[df['Estimated LDL (mg/dL)'] < df['Total Cholesterol (mg/dL)']].shape

(1454, 22)

In [67]:
df[df['Estimated LDL (mg/dL)'] >= df['Total Cholesterol (mg/dL)']][['Estimated LDL (mg/dL)', 'HDL (mg/dL)','Total Cholesterol (mg/dL)']]

Unnamed: 0,Estimated LDL (mg/dL),HDL (mg/dL),Total Cholesterol (mg/dL)
85,109.0,79.0,102.0
232,109.0,42.0,109.0
389,226.0,37.0,197.0
468,109.0,67.0,101.0
487,206.0,55.0,197.0
826,202.0,37.0,197.0
996,210.0,47.0,197.0
1246,109.0,37.0,108.0
1511,209.0,33.0,197.0


In [68]:
df = df[df['Estimated LDL (mg/dL)'] < df['Total Cholesterol (mg/dL)']]

9. Realistic Liqid Ranges

| Feature           | Range   |
| ----------------- | ------- |
| Total Cholesterol | 100–400 |
| HDL               | 20–100  |
| LDL               | 50–250  |


In [79]:
df[
    df['Total Cholesterol (mg/dL)'].between(100, 400) &
    df['HDL (mg/dL)'].between(20, 100) &
    df['Estimated LDL (mg/dL)'].between(50, 250)
].shape 

(1198, 22)

In [83]:
df[~df['Estimated LDL (mg/dL)'].between(50, 250)]['Estimated LDL (mg/dL)']

2        0.0
15      44.0
22      43.0
25      47.0
39       7.0
        ... 
1467    43.0
1468    26.0
1500    45.0
1506    -3.0
1514    36.0
Name: Estimated LDL (mg/dL), Length: 256, dtype: float64

In [99]:
# Keep clinically valid LDL
df = df[df['Estimated LDL (mg/dL)'] >= 30]

## Waist to Height Ratio Logic

11. Ratio Formula

In [105]:
(df['Abdominal Circumference (cm)'] / df['Height (cm)']).between(0.3, 0.7).shape[0] == df.shape[0]

True

## Age-Based Sanity Checks

12. Age vs Risk

If:

- Age < 30 → High risk should be rare

- Age > 60 → High risk likely

In [106]:
df.groupby('CVD Risk Level')['Age'].mean()

CVD Risk Level
HIGH            47.500750
INTERMEDIARY    45.238095
LOW             48.354497
Name: Age, dtype: float64

In [107]:
df['Age'].describe()

count    1360.000000
mean       46.780882
std        11.907193
min        25.000000
25%        37.000000
50%        46.000000
75%        55.000000
max        79.000000
Name: Age, dtype: float64

## Risk Score & Risk Level Alignment

13. Score <-> Level

| Risk Level | Score Range |
| ---------- | ----------- |
| Low        | 0–10        |
| Medium     | 10–20       |
| High       | >20         |


In [114]:
df.groupby('CVD Risk Level')['CVD Risk Score'].aggregate(['mean','max','min','median'])

Unnamed: 0_level_0,mean,max,min,median
CVD Risk Level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HIGH,17.726161,24.17,11.25,17.63
INTERMEDIARY,16.270586,23.51,10.53,16.133444
LOW,17.147066,23.797,10.86,17.006419


## Smoking & Age Rule (Soft Check)

Smoking status for age < 12 -> invaild

In [116]:
df[~((df['Age'] < 12) & (df['Smoking Status'] == 1))].shape[0] == df.shape[0]

True

In [117]:
df.to_csv('../Data/cvd_validated_dataset.csv', index=False)