In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

data = pd.read_csv('heart.csv')
print("first few rows : ")
data.head()

print("last few rows : ")
data.tail()

data.isnull()

duplicates = data.duplicated().sum()
print(f"number of duplicate rows : {duplicates}")

data_cleaned = data.drop_duplicates()
print(f"Number of rows after removing duplicates: {data_cleaned.shape[0]}")

missing_values = data_cleaned.isnull().sum()
print("Missing values in each column:")
print(missing_values)

if data_cleaned['Thal'].isnull().sum() > 0:
    mode_value = data_cleaned['Thal'].mode()[0]
    data_cleaned['Thal'] = data_cleaned['Thal'].fillna(mode_value)
if data_cleaned['Ca'].isnull().sum() > 0:
    median_value = data_cleaned['Ca'].median()
    data_cleaned['Ca'] = data_cleaned['Ca'].fillna(median_value)

missing_values_after = data_cleaned.isnull().sum()
print("Missing values in each column after handling:")
print(missing_values_after)

numerical_features = data_cleaned.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
data_cleaned[numerical_features] = scaler.fit_transform(data_cleaned[numerical_features])
print("Data after scaling:")
data_cleaned.head()

train_data, test_data = train_test_split(data_cleaned, test_size=0.25, random_state=42)
print("Training data size:", train_data.shape)
print("Testing data size:", test_data.shape)

scaler = StandardScaler()
data[['Age', 'RestBP', 'Chol', 'MaxHR', 'Oldpeak']] = scaler.fit_transform(data[['Age', 'RestBP', 'Chol', 'MaxHR', 'Oldpeak']])

scaler = MinMaxScaler()
data[['Age', 'RestBP', 'Chol', 'MaxHR', 'Oldpeak']] = scaler.fit_transform(data[['Age', 'RestBP', 'Chol', 'MaxHR', 'Oldpeak']])
print("\nColumns after transformation:")
print(data.columns)

data.head()

data['AgeGroup'] = pd.cut(data['Age'], bins=[-3, -1, 0, 2], labels=['Young', 'Middle-aged', 'Senior'])
data.head()


first few rows : 
last few rows : 
number of duplicate rows : 0
Number of rows after removing duplicates: 303
Missing values in each column:
Unnamed: 0    0
Age           0
Sex           0
ChestPain     0
RestBP        0
Chol          0
Fbs           0
RestECG       0
MaxHR         0
ExAng         0
Oldpeak       0
Slope         0
Ca            4
Thal          2
AHD           0
dtype: int64
Missing values in each column after handling:
Unnamed: 0    0
Age           0
Sex           0
ChestPain     0
RestBP        0
Chol          0
Fbs           0
RestECG       0
MaxHR         0
ExAng         0
Oldpeak       0
Slope         0
Ca            0
Thal          0
AHD           0
dtype: int64
Data after scaling:
Training data size: (227, 15)
Testing data size: (76, 15)

Columns after transformation:
Index(['Unnamed: 0', 'Age', 'Sex', 'ChestPain', 'RestBP', 'Chol', 'Fbs',
       'RestECG', 'MaxHR', 'ExAng', 'Oldpeak', 'Slope', 'Ca', 'Thal', 'AHD'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD,AgeGroup
0,1,0.708333,1,typical,0.481132,0.244292,1,2,0.603053,0,0.370968,3,0.0,fixed,No,Senior
1,2,0.791667,1,asymptomatic,0.622642,0.365297,0,2,0.282443,1,0.241935,2,3.0,normal,Yes,Senior
2,3,0.791667,1,asymptomatic,0.245283,0.23516,0,2,0.442748,1,0.419355,2,2.0,reversable,Yes,Senior
3,4,0.166667,1,nonanginal,0.339623,0.283105,0,0,0.885496,0,0.564516,3,0.0,normal,No,Senior
4,5,0.25,0,nontypical,0.339623,0.178082,0,2,0.770992,0,0.225806,1,0.0,normal,No,Senior
