In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [38]:
data= pd.read_csv("train.csv")

In [39]:
data['BP'].unique()

array([152, 125, 160, 134, 140, 138, 130, 120, 150, 108, 110, 178, 124,
        94, 112, 128, 118, 100, 105, 172, 180, 145, 132, 142, 122, 135,
       136, 126, 106, 101, 115, 156, 170, 146, 192, 102, 117, 148, 104,
       200, 165, 129, 174, 123, 144, 158, 133, 103, 147, 155, 149, 109,
       168, 111, 154, 127, 114, 116, 175, 141, 131, 162,  99,  96,  95,
       184], dtype=int64)

In [40]:
def bp_flag(age, sbp):
    """
    age: years (int or float)
    sbp: systolic blood pressure (mm Hg)
    """

    # --- Determine age-based normal SBP ---
    if age < 40:
        normal_upper = 119
    elif age < 60:
        normal_upper = 124
    else:
        normal_upper = 139

    # --- BP category based on systolic ---
    if sbp < normal_upper:
        return 'normal'
    elif sbp < 129:
        return 'elevated'
    elif sbp < 139:
        return 'stage_1'
    elif sbp < 180:
        return 'stage_2'
    else:
        return 'hypertensive_crisis'

In [41]:
data['bp_flag'] = data.apply(
    lambda x: bp_flag(x['Age'], x['BP']),
    axis=1
)

In [42]:
data.head()

Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease,bp_flag
0,0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,Presence,stage_2
1,1,52,1,1,125,325,0,2,171,0,0.0,1,0,3,Absence,elevated
2,2,56,0,2,160,188,0,2,151,0,0.0,1,0,3,Absence,stage_2
3,3,44,0,3,134,229,0,2,150,0,1.0,2,0,3,Absence,stage_1
4,4,58,1,4,140,234,0,2,125,1,3.8,2,3,3,Presence,stage_2


In [43]:
# map bp_flag to numerical values
bp_flag_mapping = {
    'normal': 1,
    'elevated': 2,
    'stage_1': 3,
    'stage_2': 4,
    'hypertensive_crisis': 5
}

data['bp_flag'] = data['bp_flag'].map(bp_flag_mapping)

In [44]:
data['Heart Disease'] = data['Heart Disease'].map({'Absence': 0, 'Presence': 1})

In [45]:
#corrleation of bp_flag with target
correlation = data['bp_flag'].corr(data['Heart Disease'])
print(f"Correlation between bp_flag and Heart Disease: {correlation}")

Correlation between bp_flag and Heart Disease: -0.02141552208330856


In [47]:
def cholesterol_flag(chol):
    """
    chol: total cholesterol (mg/dL)
    """
    if chol <= 170:
        return 1
    elif chol < 200:
        return 2
    else:
        return 3

data['chol_flag'] = data['Cholesterol'].apply(cholesterol_flag)
correlation_chol = (data['chol_flag'] / data['bp_flag']).corr(data['Heart Disease'])
print(f"Correlation between chol_flag and Heart Disease: {correlation_chol}")

Correlation between chol_flag and Heart Disease: 0.042105715251220865


In [48]:
data['HR_reserve'] = data['Max HR'] - (220 - data['Age'])
data['HR_ratio']   = data['Max HR'] / (220 - data['Age'])

correlation_hr_reserve = data['HR_reserve'].corr(data['Heart Disease'])
print(f"Correlation between HR_reserve and Heart Disease: {correlation_hr_reserve}")
correlation_hr_ratio = data['HR_ratio'].corr(data['Heart Disease'])
print(f"Correlation between HR_ratio and Heart Disease: {correlation_hr_ratio}")

Correlation between HR_reserve and Heart Disease: -0.3327562691470124
Correlation between HR_ratio and Heart Disease: -0.3433827399849322


In [49]:
st_log = np.log1p(data['ST depression'])
st_squared = data['ST depression'] ** 2

data['ST_log'] = st_log
data['ST_squared'] = st_squared

correlation_st_log = data['ST_log'].corr(data['Heart Disease'])
print(f"Correlation between ST_log and Heart Disease: {correlation_st_log}")
correlation_st_squared = data['ST_squared'].corr(data['Heart Disease'])
print(f"Correlation between ST_squared and Heart Disease: {correlation_st_squared}")

Correlation between ST_log and Heart Disease: 0.4399804706236696
Correlation between ST_squared and Heart Disease: 0.3522291132977642


In [50]:
vessel_risk = data['Number of vessels fluro'] >= 2
data['vessel_risk'] = vessel_risk.astype(int)

correlation_vessel_risk = data['vessel_risk'].corr(data['Heart Disease'])
print(f"Correlation between vessel_risk and Heart Disease: {correlation_vessel_risk}")

Correlation between vessel_risk and Heart Disease: 0.33764295234549585


In [51]:
thallium_high_risk = data['Thallium'].isin([6, 7])
data['thallium_high_risk'] = thallium_high_risk.astype(int)
correlation_thallium_risk = data['thallium_high_risk'].corr(data['Heart Disease'])
print(f"Correlation between thallium_high_risk and Heart Disease: {correlation_thallium_risk}")

Correlation between thallium_high_risk and Heart Disease: 0.604886595879062


In [52]:
# compute stress and diagnostic scores (ensure boolean parts are converted to integers)
data['stress_score'] = (
    data['Exercise angina'].astype(int)
    + (data['ST depression'] > 1).astype(int)
    + (data['Slope of ST'] == 2).astype(int)
)

data['diagnostic_score'] = (
    (data['Number of vessels fluro'] >= 2).astype(int)
    + data['thallium_high_risk'].astype(int)
    + (data['EKG results'] != 0).astype(int)
)

correlation_stress_score = data['stress_score'].corr(data['Heart Disease'])
print(f"Correlation between stress_score and Heart Disease: {correlation_stress_score}")
correlation_diagnostic_score = data['diagnostic_score'].corr(data['Heart Disease'])
print(f"Correlation between diagnostic_score and Heart Disease: {correlation_diagnostic_score}")

Correlation between stress_score and Heart Disease: 0.5720860782812971
Correlation between diagnostic_score and Heart Disease: 0.5794538109383613


In [1]:
from flaml import AutoML