In [4]:
import pandas as pd

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [6]:
data = pd.read_csv('heart.csv')

In [7]:
print("first few rows : ")
data.head()


first few rows : 


Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [8]:
print("last few rows : ")
data.tail()

last few rows : 


Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
298,299,45,1,typical,110,264,0,0,132,0,1.2,2,0.0,reversable,Yes
299,300,68,1,asymptomatic,144,193,1,0,141,0,3.4,2,2.0,reversable,Yes
300,301,57,1,asymptomatic,130,131,0,0,115,1,1.2,2,1.0,reversable,Yes
301,302,57,0,nontypical,130,236,0,2,174,0,0.0,2,1.0,normal,Yes
302,303,38,1,nonanginal,138,175,0,0,173,0,0.0,1,,normal,No


In [9]:
data.isnull()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
299,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
300,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
301,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [10]:
duplicates = data.duplicated().sum()
print(f"number of duplicate rows : {duplicates}")


number of duplicate rows : 0


In [11]:
data_cleaned = data.drop_duplicates()
print(f"Number of rows after removing duplicates: {data_cleaned.shape[0]}")


Number of rows after removing duplicates: 303


In [12]:
missing_values = data_cleaned.isnull().sum()
print("Missing values in each column:")
print(missing_values)

Missing values in each column:
Unnamed: 0    0
Age           0
Sex           0
ChestPain     0
RestBP        0
Chol          0
Fbs           0
RestECG       0
MaxHR         0
ExAng         0
Oldpeak       0
Slope         0
Ca            4
Thal          2
AHD           0
dtype: int64


In [14]:
if data_cleaned['Thal'].isnull().sum() > 0:
    mode_value = data_cleaned['Thal'].mode()[0]
    data_cleaned['Thal'] = data_cleaned['Thal'].fillna(mode_value)

In [17]:
if data_cleaned['Ca'].isnull().sum() > 0:
    median_value = data_cleaned['Ca'].median()
    data_cleaned['Ca'] = data_cleaned['Ca'].fillna(median_value)


In [18]:
missing_values_after = data_cleaned.isnull().sum()
print("Missing values in each column after handling:")
print(missing_values_after)


Missing values in each column after handling:
Unnamed: 0    0
Age           0
Sex           0
ChestPain     0
RestBP        0
Chol          0
Fbs           0
RestECG       0
MaxHR         0
ExAng         0
Oldpeak       0
Slope         0
Ca            0
Thal          0
AHD           0
dtype: int64


In [19]:
numerical_features = data_cleaned.select_dtypes(include=['int64', 'float64']).columns

In [20]:
scaler = StandardScaler()

In [21]:
data_cleaned[numerical_features] = scaler.fit_transform(data_cleaned[numerical_features])

In [22]:
print("Data after scaling:")
data_cleaned.head()

Data after scaling:


Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,-1.726344,0.948726,0.686202,typical,0.757525,-0.2649,2.394438,1.016684,0.017197,-0.696631,1.087338,2.274579,-0.711131,fixed,No
1,-1.714911,1.392002,0.686202,asymptomatic,1.61122,0.760415,-0.417635,1.016684,-1.821905,1.435481,0.397182,0.649113,2.504881,normal,Yes
2,-1.703478,1.392002,0.686202,asymptomatic,-0.6653,-0.342283,-0.417635,1.016684,-0.902354,1.435481,1.346147,0.649113,1.432877,reversable,Yes
3,-1.692046,-1.932564,0.686202,nonanginal,-0.09617,0.063974,-0.417635,-0.996749,1.637359,-0.696631,2.122573,2.274579,-0.711131,normal,No
4,-1.680613,-1.489288,-1.457296,nontypical,-0.09617,-0.825922,-0.417635,1.016684,0.980537,-0.696631,0.310912,-0.976352,-0.711131,normal,No


In [23]:
train_data, test_data = train_test_split(data_cleaned, test_size=0.25, random_state=42)

In [24]:
print("Training data size:", train_data.shape)
print("Testing data size:", test_data.shape)

Training data size: (227, 15)
Testing data size: (76, 15)


In [27]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data[['Age', 'RestBP', 'Chol', 'MaxHR', 'Oldpeak']] = scaler.fit_transform(data[['Age', 'RestBP', 'Chol', 'MaxHR', 'Oldpeak']])


In [29]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data[['Age', 'RestBP', 'Chol', 'MaxHR', 'Oldpeak']] = scaler.fit_transform(data[['Age', 'RestBP', 'Chol', 'MaxHR', 'Oldpeak']])


In [31]:
print("\nColumns after transformation:")
print(data.columns)


Columns after transformation:
Index(['Unnamed: 0', 'Age', 'Sex', 'ChestPain', 'RestBP', 'Chol', 'Fbs',
       'RestECG', 'MaxHR', 'ExAng', 'Oldpeak', 'Slope', 'Ca', 'Thal', 'AHD'],
      dtype='object')


In [32]:
data.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,1,0.708333,1,typical,0.481132,0.244292,1,2,0.603053,0,0.370968,3,0.0,fixed,No
1,2,0.791667,1,asymptomatic,0.622642,0.365297,0,2,0.282443,1,0.241935,2,3.0,normal,Yes
2,3,0.791667,1,asymptomatic,0.245283,0.23516,0,2,0.442748,1,0.419355,2,2.0,reversable,Yes
3,4,0.166667,1,nonanginal,0.339623,0.283105,0,0,0.885496,0,0.564516,3,0.0,normal,No
4,5,0.25,0,nontypical,0.339623,0.178082,0,2,0.770992,0,0.225806,1,0.0,normal,No


In [34]:
data['AgeGroup'] = pd.cut(data['Age'], bins=[-3, -1, 0, 2], labels=['Young', 'Middle-aged', 'Senior'])


In [35]:
data.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD,AgeGroup
0,1,0.708333,1,typical,0.481132,0.244292,1,2,0.603053,0,0.370968,3,0.0,fixed,No,Senior
1,2,0.791667,1,asymptomatic,0.622642,0.365297,0,2,0.282443,1,0.241935,2,3.0,normal,Yes,Senior
2,3,0.791667,1,asymptomatic,0.245283,0.23516,0,2,0.442748,1,0.419355,2,2.0,reversable,Yes,Senior
3,4,0.166667,1,nonanginal,0.339623,0.283105,0,0,0.885496,0,0.564516,3,0.0,normal,No,Senior
4,5,0.25,0,nontypical,0.339623,0.178082,0,2,0.770992,0,0.225806,1,0.0,normal,No,Senior
