### 1. Import Necessary Libraries & Load Dataset

In [1]:
import numpy as np
import pandas as pd


In [2]:
# The file is read with ';' as the field separator rather than the default ','.
df = pd.read_csv("cardio_train.csv", delimiter=";")
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


### 2. Convert Age Column (into Years)

In [4]:
# Derive age in years, rounded to one decimal place.
# NOTE(review): dividing by 365 assumes `age` is recorded in days — confirm against the data dictionary.
df['age_years'] = df['age'].div(365).round(1).astype(float)
df['age_years'].head()

0    50.4
1    55.4
2    51.7
3    48.3
4    47.9
Name: age_years, dtype: float64

### 3. Handle Missing Values

In [5]:
# Number of missing entries in each column.
df.isna().sum()

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
age_years      0
dtype: int64

### 4. Outlier Detection

In [7]:
def flag_iqr_outliers(series, k=1.5):
    """Flag values that fall outside the IQR fences of ``series``.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to inspect.
    k : float, default 1.5
        Fence multiplier. The bounds are ``Q1 - k * IQR`` and ``Q3 + k * IQR``;
        the default keeps the 1.5 * IQR rule used throughout this notebook.

    Returns
    -------
    pandas.Series
        Integer series on the same index as ``series``: 1 where the value is
        strictly below the lower bound or strictly above the upper bound,
        0 otherwise.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - k * iqr
    upper = q3 + k * iqr
    # Strict comparisons: a value sitting exactly on a bound is not flagged.
    return ((series < lower) | (series > upper)).astype(int)

continuous_cols = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo']

# Apply the IQR flagging column by column: one 0/1 flag column per feature.
outlier_flags = df[continuous_cols].apply(flag_iqr_outliers)

# Flagged rows per feature, largest count first.
outlier_counts = outlier_flags.sum().sort_values(ascending=False)
outlier_counts

ap_lo        4632
weight       1819
ap_hi        1435
height        519
age_years       4
dtype: int64

### 5. Handle Outliers

In [14]:
def iqr_method(series):
    """Cap ``series`` at its 1.5 * IQR bounds.

    Values below ``Q1 - 1.5 * IQR`` are raised to that bound, values above
    ``Q3 + 1.5 * IQR`` are lowered to it, and everything in between is
    returned unchanged.
    """
    first_q, third_q = series.quantile([0.25, 0.75])
    spread = third_q - first_q

    floor = first_q - 1.5 * spread
    ceiling = third_q + 1.5 * spread

    return series.clip(lower=floor, upper=ceiling)

# Cap every continuous feature at its own IQR bounds, column by column.
df[continuous_cols] = df[continuous_cols].apply(iqr_method)

### 6. Train/Test Split

In [15]:
from sklearn.model_selection import train_test_split

# Features: everything except the row id, the original `age` column
# (`age_years` is derived from it) and the target itself.
X = df.drop(columns=['id', 'age', 'cardio'])
y = df['cardio']

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training Shape: {X_train.shape}")
print(f"Testing Shape: {X_test.shape}")

Training Shape: (56000, 11)
Testing Shape: (14000, 11)


### 7. Scaling Numerical Data

#### `fit_transform()` learns the scaling parameters from the data and applies the transformation, while `transform()` only applies parameters that have already been learned.

We use fit_transform() on training data to learn parameters and transform() on test data to avoid data leakage.

In [17]:
from sklearn.preprocessing import MinMaxScaler

num_cols = ['age_years','height','weight','ap_hi','ap_lo']

scaler = MinMaxScaler()

# Learn each column's min/max from the training rows only, then rescale them.
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

# Apply the training-set min/max to the test rows. Without this step the test
# features stay on their original scale and no longer match what a model
# trained on X_train expects; calling transform() (not fit_transform()) keeps
# test-set statistics out of the scaler.
X_test[num_cols] = scaler.transform(X_test[num_cols])


### 8. Encoding Categorical Columns

#### The categorical columns are already numerically encoded, so no further encoding is needed.

In [18]:
cat_cols = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']


def find_unique_value_counts(series):
    """Print how often each distinct value occurs in one column of ``df``.

    Despite its name, ``series`` is a column label: it is looked up in the
    notebook-level ``df`` and the resulting value counts are printed.
    """
    counts = df[series].value_counts()
    print(counts)


for col in cat_cols:
    find_unique_value_counts(col)

gender
1    45530
2    24470
Name: count, dtype: int64
cholesterol
1    52385
2     9549
3     8066
Name: count, dtype: int64
gluc
1    59479
3     5331
2     5190
Name: count, dtype: int64
smoke
0    63831
1     6169
Name: count, dtype: int64
alco
0    66236
1     3764
Name: count, dtype: int64
active
1    56261
0    13739
Name: count, dtype: int64
