In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Read the heart-disease records from the CSV in the working directory into `data`.
data = pd.read_csv(filepath_or_buffer='Heart.csv')

In [None]:
# Quick look at the first five records, with the caption on its own line above the table.
print("First few rows of the dataset:", data.head(), sep="\n")

First few rows of the dataset:
   Unnamed: 0  Age  Sex     ChestPain  RestBP  Chol  Fbs  RestECG  MaxHR  \
0           1   63    1       typical     145   233    1        2    150   
1           2   67    1  asymptomatic     160   286    0        2    108   
2           3   67    1  asymptomatic     120   229    0        2    129   
3           4   37    1    nonanginal     130   250    0        0    187   
4           5   41    0    nontypical     130   204    0        2    172   

   ExAng  Oldpeak  Slope   Ca        Thal  AHD  
0      0      2.3      3  0.0       fixed   No  
1      1      1.5      2  3.0      normal  Yes  
2      1      2.6      2  2.0  reversable  Yes  
3      0      3.5      3  0.0      normal   No  
4      0      1.4      1  0.0      normal   No  


In [None]:
# Data Cleaning
# Count repeated records.
# 'Unnamed: 0' looks like a row counter saved with the CSV (1, 2, 3, ... in the
# preview). It is left out of the comparison: with it included every row is
# unique by construction, so a repeated record could never be detected.
duplicates = data.duplicated(
    subset=[col for col in data.columns if col != 'Unnamed: 0']
).sum()
print(f"Number of duplicate rows: {duplicates}")

Number of duplicate rows: 0


In [None]:
# Removing duplicates
# Rows are compared on their content only. 'Unnamed: 0' looks like a row counter
# saved with the CSV; if it took part in the comparison no two rows could ever
# match and nothing would be removed. The first occurrence of each record is kept.
data_cleaned = data.drop_duplicates(
    subset=[col for col in data.columns if col != 'Unnamed: 0']
)

In [None]:
# Report how many records are left once duplicates have been dropped.
row_count = len(data_cleaned.index)
print(f"Number of rows after removing duplicates: {row_count}")

Number of rows after removing duplicates: 303


Handling missing values

In [None]:
# Per-column tally of missing entries in the de-duplicated frame.
missing_values = data_cleaned.isna().sum()
print(missing_values)

Unnamed: 0    0
Age           0
Sex           0
ChestPain     0
RestBP        0
Chol          0
Fbs           0
RestECG       0
MaxHR         0
ExAng         0
Oldpeak       0
Slope         0
Ca            4
Thal          2
AHD           0
dtype: int64


In [None]:
# Optional: persist the de-duplicated frame, without the pandas index, to a new CSV.
# NOTE(review): this runs before missing values are filled, so the file still
# contains the gaps reported for Ca and Thal - confirm that is intended.
data_cleaned.to_csv(path_or_buf='heart_cleaned.csv', index=False)

In [None]:
# Fill every missing value so no NaN reaches scaling or modelling.
#   * numeric columns     -> column mean
#   * non-numeric columns -> most frequent value (first one on ties)
# A numeric-only fill leaves text columns such as Thal untouched, which is why
# Thal still reported missing entries after this step.
# Statistics come from data_cleaned itself, so they describe exactly the rows
# being filled, and the result is rebound instead of mutated in place so that
# re-running the cell is harmless.
fill_values = data_cleaned.select_dtypes(include='number').mean().to_dict()
categorical_modes = data_cleaned.select_dtypes(exclude='number').mode()
if not categorical_modes.empty:
    # Row 0 of .mode() holds the most frequent value of each column.
    fill_values.update(categorical_modes.iloc[0].to_dict())
data_cleaned = data_cleaned.fillna(value=fill_values)


In [None]:
# Re-count the gaps per column to confirm what the imputation step left behind.
missing_values_after = data_cleaned.isna().sum()
print("Missing values in each column after handling:", missing_values_after, sep="\n")

Missing values in each column after handling:
Unnamed: 0    0
Age           0
Sex           0
ChestPain     0
RestBP        0
Chol          0
Fbs           0
RestECG       0
MaxHR         0
ExAng         0
Oldpeak       0
Slope         0
Ca            0
Thal          2
AHD           0
dtype: int64


Data Transformation

In [None]:
# Peek at the first five cleaned records before transforming them.
print(data_cleaned.head(n=5))

   Unnamed: 0  Age  Sex     ChestPain  RestBP  Chol  Fbs  RestECG  MaxHR  \
0           1   63    1       typical     145   233    1        2    150   
1           2   67    1  asymptomatic     160   286    0        2    108   
2           3   67    1  asymptomatic     120   229    0        2    129   
3           4   37    1    nonanginal     130   250    0        0    187   
4           5   41    0    nontypical     130   204    0        2    172   

   ExAng  Oldpeak  Slope   Ca        Thal  AHD  
0      0      2.3      3  0.0       fixed   No  
1      1      1.5      2  3.0      normal  Yes  
2      1      2.6      2  2.0  reversable  Yes  
3      0      3.5      3  0.0      normal   No  
4      0      1.4      1  0.0      normal   No  


In [None]:
# Standardise the numeric columns to zero mean and unit variance.
# Columns are taken from data_cleaned (the frame actually being scaled) by
# generic numeric dtype rather than a fixed int64/float64 list. 'Unnamed: 0' is
# skipped: it looks like a row counter saved with the CSV, so scaling it would
# only feed row order into later steps as if it were a feature.
numerical_features = (
    data_cleaned.select_dtypes(include='number')
    .columns.difference(['Unnamed: 0'], sort=False)
)

# Initialize the StandardScaler
scaler = StandardScaler()

# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split, so test-set statistics leak into the scaling. Consider fitting on the
# training rows only and applying transform() to the test rows.
scaled_values = scaler.fit_transform(data_cleaned[numerical_features])

# Work on a copy so no other reference to the unscaled frame is altered.
data_cleaned = data_cleaned.copy()
data_cleaned[numerical_features] = scaled_values


In [None]:
# Show the first five records after standardisation, caption first.
print("Data after scaling:", data_cleaned.head(), sep="\n")


Data after scaling:
   Unnamed: 0       Age       Sex     ChestPain    RestBP      Chol       Fbs  \
0   -1.726344  0.948726  0.686202       typical  0.757525 -0.264900  2.394438   
1   -1.714911  1.392002  0.686202  asymptomatic  1.611220  0.760415 -0.417635   
2   -1.703478  1.392002  0.686202  asymptomatic -0.665300 -0.342283 -0.417635   
3   -1.692046 -1.932564  0.686202    nonanginal -0.096170  0.063974 -0.417635   
4   -1.680613 -1.489288 -1.457296    nontypical -0.096170 -0.825922 -0.417635   

    RestECG     MaxHR     ExAng   Oldpeak     Slope        Ca        Thal  AHD  
0  1.016684  0.017197 -0.696631  1.087338  2.274579 -0.723095       fixed   No  
1  1.016684 -1.821905  1.435481  0.397182  0.649113  2.503851      normal  Yes  
2  1.016684 -0.902354  1.435481  1.346147  0.649113  1.428203  reversable  Yes  
3 -0.996749  1.637359 -0.696631  2.122573  2.274579 -0.723095      normal   No  
4  1.016684  0.980537 -0.696631  0.310912 -0.976352 -0.723095      normal   No  


In [None]:
# Hold out a quarter of the rows for testing; the fixed seed keeps the split repeatable.
train_data, test_data = train_test_split(
    data_cleaned,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Print the (rows, columns) shape of each partition.
for heading, frame in (
    ("Training data size:", train_data),
    ("Testing data size:", test_data),
):
    print(heading, frame.shape)

Training data size: (227, 15)
Testing data size: (76, 15)
