In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [70]:
# Load the raw dataset; fields in this file are separated by semicolons,
# so the delimiter has to be given explicitly.
df = pd.read_csv(
    'cardio_train.csv',
    sep=';',
)

In [71]:
# Assuming 'df' is loaded
# Structural overview: column dtypes, non-null counts and memory usage.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()

# Peek at the first rows
print(df.head())

# Check for null values
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB
None
   id    age  gender  height  weight  ap_hi  ap_lo  cholesterol  gluc  smoke  \
0   0  18393       2     168    62.0    110     80            1     1      0   
1   1  20228       1     156    85.0    140  

In [72]:
# The raw `age` column is expressed in days: divide by the mean year length
# (365.25 days), round to the nearest whole year and store as an integer.
df['age_years'] = df['age'].div(365.25).round().astype(int)

In [73]:
def clean_bp_outliers(df, ap_hi_bounds=(70, 250), ap_lo_bounds=(40, 140)):
    """
    Filters out extreme blood pressure values based on clinically reasonable ranges.

    A row is kept only when all of the following hold:

    * ``ap_hi`` lies strictly between the two values in ``ap_hi_bounds``
    * ``ap_lo`` lies strictly between the two values in ``ap_lo_bounds``
    * ``ap_hi`` is not lower than ``ap_lo`` (equal readings are kept)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``ap_hi`` and ``ap_lo`` columns.
    ap_hi_bounds : tuple of (lower, upper), default (70, 250)
        Exclusive limits for ``ap_hi``.
    ap_lo_bounds : tuple of (lower, upper), default (40, 140)
        Exclusive limits for ``ap_lo``.

    Returns
    -------
    pandas.DataFrame
        The rows that pass every check, with their original index labels.
        The input frame itself is not modified.

    Raises
    ------
    KeyError
        If ``ap_hi`` or ``ap_lo`` is missing from ``df``.
    """
    hi_min, hi_max = ap_hi_bounds
    lo_min, lo_max = ap_lo_bounds

    ap_hi = df['ap_hi']
    ap_lo = df['ap_lo']

    # Build one combined mask instead of filtering three times in a row:
    # the selected rows are the same, but no intermediate frames are created.
    keep = (
        (ap_hi > hi_min) & (ap_hi < hi_max)
        & (ap_lo > lo_min) & (ap_lo < lo_max)
        # ap_hi must be at least as large as ap_lo; equality is tolerated.
        & (ap_hi >= ap_lo)
    )

    return df[keep]

# Run the blood-pressure filter on the working frame and report the
# resulting (rows, columns) shape.
df = df.pipe(clean_bp_outliers)
print(f"Data shape after cleaning outliers: {df.shape}")


Data shape after cleaning outliers: (68635, 14)


In [74]:
# Remove the `id` column (presumably just a record identifier -- confirm it
# is not needed downstream). Note: re-running this cell on the same frame
# raises KeyError because the column is already gone.
df = df.drop(columns=['id'])

In [75]:
# keep=False flags every member of a group of identical rows; collapsing the
# flagged rows afterwards leaves one representative per duplicated record.
unique_duplicate_rows = (
    df.loc[df.duplicated(keep=False)]
    .drop_duplicates()
)
unique_duplicate_rows

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years
1142,17493,2,169,74.0,120,80,1,1,0,0,1,1,48
1204,16793,1,165,68.0,120,80,1,1,0,0,1,0,46
1568,21945,1,165,60.0,120,80,1,1,0,0,1,0,60
1612,20293,1,162,70.0,110,70,1,1,0,0,1,0,56
2305,20495,1,165,70.0,120,80,1,1,0,0,1,0,56
2677,22077,1,175,69.0,120,80,1,1,0,0,1,1,60
6325,14552,1,158,64.0,120,80,1,1,0,0,1,0,40
8190,21778,1,160,58.0,120,80,1,1,0,0,1,0,60
10494,16937,2,170,70.0,120,80,1,1,0,0,0,0,46
10777,18988,1,164,65.0,120,80,1,1,0,0,1,0,52


In [76]:
# Retain only the first occurrence of each fully identical row; every later
# repeat is filtered out. Index labels of the surviving rows are unchanged.
data_cleane = df.loc[~df.duplicated(keep='first')]
data_cleane

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years
0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50
1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55
2,18857,1,165,64.0,130,70,3,1,0,0,0,1,52
3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48
4,17474,1,156,56.0,100,60,1,1,0,0,0,0,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120,80,1,1,1,0,1,0,53
69996,22601,1,158,126.0,140,90,2,2,0,0,1,1,62
69997,19066,2,183,105.0,180,90,3,1,0,1,0,1,52
69998,22431,1,163,72.0,135,80,1,2,0,0,0,1,61


In [79]:
# Continue on an independent deep copy so that later column edits on `df`
# cannot propagate back into `data_cleane`.
df = data_cleane.copy(deep=True)

In [80]:
# Show the working frame as this cell's output.
# NOTE(review): the saved output below already contains a `bmi` column and no
# `height`/`weight`/`age`, although those changes are made in a later cell --
# presumably the output was captured after out-of-order execution. Re-run from
# a fresh kernel to confirm.
df

Unnamed: 0,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi
0,2,110,80,1,1,0,0,1,0,50,22.0
1,1,140,90,3,1,0,0,1,1,55,34.9
2,1,130,70,3,1,0,0,0,1,52,23.5
3,2,150,100,1,1,0,0,1,1,48,28.7
4,1,100,60,1,1,0,0,0,0,48,23.0
...,...,...,...,...,...,...,...,...,...,...,...
69995,2,120,80,1,1,1,0,1,0,53,26.9
69996,1,140,90,2,2,0,0,1,1,62,50.5
69997,2,180,90,3,1,0,1,0,1,52,31.4
69998,1,135,80,1,2,0,0,0,1,61,27.1


In [82]:
# Show columns
print(df.columns)

# BMI = weight / height^2, with height converted to metres by dividing by 100.
# The guard makes the cell safe to re-run once the source columns have
# already been dropped; in that case `bmi` is left as it is.
if {'weight', 'height'}.issubset(df.columns):
    df['bmi'] = df['weight'].div(df['height'].div(100).pow(2)).round(1)

# The raw measurements are removed only after BMI exists;
# errors='ignore' tolerates columns that are already absent.
df = df.drop(columns=['height', 'weight', 'age'], errors='ignore')

df

Index(['gender', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'smoke', 'alco',
       'active', 'cardio', 'age_years', 'bmi'],
      dtype='object')


Unnamed: 0,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi
0,2,110,80,1,1,0,0,1,0,50,22.0
1,1,140,90,3,1,0,0,1,1,55,34.9
2,1,130,70,3,1,0,0,0,1,52,23.5
3,2,150,100,1,1,0,0,1,1,48,28.7
4,1,100,60,1,1,0,0,0,0,48,23.0
...,...,...,...,...,...,...,...,...,...,...,...
69995,2,120,80,1,1,1,0,1,0,53,26.9
69996,1,140,90,2,2,0,0,1,1,62,50.5
69997,2,180,90,3,1,0,1,0,1,52,31.4
69998,1,135,80,1,2,0,0,0,1,61,27.1


In [84]:
# Lower extreme of the derived BMI column -- a quick plausibility check.
df.loc[:, 'bmi'].min()

3.5

In [85]:
# Upper extreme of the derived BMI column -- a quick plausibility check.
df.loc[:, 'bmi'].max()

298.7

In [86]:
# Show the working frame again as this cell's output.
# NOTE(review): the BMI extremes reported by the two preceding cells (3.5 and
# 298.7) look implausible, which suggests height/weight outliers are still in
# the data -- TODO confirm and filter before modelling.
df

Unnamed: 0,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi
0,2,110,80,1,1,0,0,1,0,50,22.0
1,1,140,90,3,1,0,0,1,1,55,34.9
2,1,130,70,3,1,0,0,0,1,52,23.5
3,2,150,100,1,1,0,0,1,1,48,28.7
4,1,100,60,1,1,0,0,0,0,48,23.0
...,...,...,...,...,...,...,...,...,...,...,...
69995,2,120,80,1,1,1,0,1,0,53,26.9
69996,1,140,90,2,2,0,0,1,1,62,50.5
69997,2,180,90,3,1,0,1,0,1,52,31.4
69998,1,135,80,1,2,0,0,0,1,61,27.1
