### 1. Import Necessary Libraries & Load Dataset

In [1]:
import pandas as pd

In [None]:
# Read the semicolon-delimited cardio training file; the bare name on the
# last line makes the notebook render a preview of the loaded frame.
dataFrame = pd.read_csv(
    filepath_or_buffer="../dataset/cardio_train.csv",
    delimiter=';',
)
dataFrame

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


### 2. Convert Age Column Into Years

In [3]:
# Divide the raw `age` values by 365 and keep one decimal place in a new
# `age_years` column; the original `age` column is left untouched.
dataFrame['age_years'] = dataFrame['age'].div(365).round(1).astype(float)
dataFrame['age_years']

0        50.4
1        55.4
2        51.7
3        48.3
4        47.9
         ... 
69995    52.7
69996    61.9
69997    52.2
69998    61.5
69999    56.3
Name: age_years, Length: 70000, dtype: float64

### 3. Handle Missing Values

In [4]:
# Per-column count of missing entries.
dataFrame.isna().sum()

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
age_years      0
dtype: int64

### 4. Outlier Detection

In [5]:
def flag_iqr_outliers(series, k=1.5):
    """Mark the values of ``series`` that fall outside its IQR fences.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to inspect.
    k : float, default 1.5
        Fence multiplier. A value is flagged when it is below
        ``Q1 - k * IQR`` or above ``Q3 + k * IQR``. The default reproduces
        the previously hard-coded 1.5 rule.

    Returns
    -------
    pandas.Series
        Integer series with the same index as ``series``: 1 where the value
        is outside the fences, 0 otherwise.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - (k * iqr)
    upper = q3 + (k * iqr)

    # Strict comparisons: a value sitting exactly on a fence is not flagged.
    return ((series < lower) | (series > upper)).astype(int)

# Columns treated as continuous measurements for the outlier checks.
continuous_cols = ['age_years', 'height', 'weight', 'ap_lo', 'ap_hi']

# One 0/1 flag column per continuous feature, computed column by column.
outlier_flags = dataFrame[continuous_cols].apply(flag_iqr_outliers)

# Number of flagged rows per feature, largest first.
outliers_counts = (
    outlier_flags
    .sum()
    .sort_values(ascending=False)
)
outliers_counts

ap_lo        4632
weight       1819
ap_hi        1435
height        519
age_years       4
dtype: int64

### 5. Handle Outliers

In [6]:
def iqr_method(series, k=1.5):
    """Clip ``series`` to its IQR fences and return the clipped copy.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to limit.
    k : float, default 1.5
        Fence multiplier. Values are limited to the interval
        ``[Q1 - k * IQR, Q3 + k * IQR]``. The default reproduces the
        previously hard-coded 1.5 rule.

    Returns
    -------
    pandas.Series
        Result of ``series.clip(lower, upper)``; values already inside the
        fences are unchanged.
    """
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1

    lower = Q1 - (k * IQR)
    upper = Q3 + (k * IQR)

    # Values beyond a fence are replaced by the fence value, not dropped,
    # so the number of rows stays the same.
    return series.clip(lower, upper)

# Replace every continuous column with its IQR-clipped version; each column
# is clipped against fences computed from that column alone.
dataFrame[continuous_cols] = dataFrame[continuous_cols].apply(iqr_method)

### 6. Train Test Split

In [7]:
from sklearn.model_selection import train_test_split

# Target is `cardio`; features are every other column except `id` and `age`.
Y = dataFrame['cardio']
X = dataFrame.drop(columns=['id', 'age', 'cardio'])

# 80/20 split with a fixed seed so the partition is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

print(f"Training Shape : {x_train.shape}")
print(f"testing Shape : {x_test.shape}")

Training Shape : (56000, 11)
testing Shape : (14000, 11)


### 7. Scaling Numerical Data

In [8]:
from sklearn.preprocessing import StandardScaler

num_cols = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo']

# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits.
scaler = StandardScaler().fit(x_train[num_cols])

x_train[num_cols] = scaler.transform(x_train[num_cols])
x_test[num_cols] = scaler.transform(x_test[num_cols])

### 8. Encoding Categorical Column

In [9]:
cat_cols = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']


def find_unique_value_counts(series):
    """Print how often each distinct value occurs in one column of ``dataFrame``.

    ``series`` is used as the column label to look up in the notebook-level
    ``dataFrame``; nothing is returned.
    """
    counts = dataFrame[series].value_counts()
    print(counts)


for col in cat_cols:
    find_unique_value_counts(col)

gender
1    45530
2    24470
Name: count, dtype: int64
cholesterol
1    52385
2     9549
3     8066
Name: count, dtype: int64
gluc
1    59479
3     5331
2     5190
Name: count, dtype: int64
smoke
0    63831
1     6169
Name: count, dtype: int64
alco
0    66236
1     3764
Name: count, dtype: int64
active
1    56261
0    13739
Name: count, dtype: int64


### 9. Handle Duplicate Values

In [11]:
# Remove `id` before looking for duplicates, so the check covers the
# remaining columns only.
# `errors='ignore'` keeps the cell re-runnable: after the first run `id` is
# already gone and a plain drop would raise KeyError.
# NOTE(review): X, Y and the train/test splits were built in section 6 from
# the frame as it was before this step, so they still contain the duplicate
# rows - consider moving this cell ahead of the split.
dataFrame = dataFrame.drop(columns='id', errors='ignore')
print(f"Duplicate Records Before:{dataFrame.duplicated().sum()}")
dataFrame = dataFrame.drop_duplicates().reset_index(drop=True)
print(f"Duplicate Records After:{dataFrame.duplicated().sum()}")

Duplicate Records Before:24
Duplicate Records After:0
