# Sprintup Project - 1. Data Preprocessing & Cleaning

In [5]:
from ucimlrepo import fetch_ucirepo 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [6]:
# Step 1: fetch dataset 
heart_disease = fetch_ucirepo(id=45) 
  
# data (as pandas dataframes) 
X = heart_disease.data.features 
y = heart_disease.data.targets 
#Step 2: droping rows with missing values
X= X.dropna()
y = y.loc[X.index]
df=pd.concat([X,y],axis=1)

print(df.info())



<class 'pandas.core.frame.DataFrame'>
Index: 297 entries, 0 to 301
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       297 non-null    int64  
 1   sex       297 non-null    int64  
 2   cp        297 non-null    int64  
 3   trestbps  297 non-null    int64  
 4   chol      297 non-null    int64  
 5   fbs       297 non-null    int64  
 6   restecg   297 non-null    int64  
 7   thalach   297 non-null    int64  
 8   exang     297 non-null    int64  
 9   oldpeak   297 non-null    float64
 10  slope     297 non-null    int64  
 11  ca        297 non-null    float64
 12  thal      297 non-null    float64
 13  num       297 non-null    int64  
dtypes: float64(3), int64(11)
memory usage: 34.8 KB
None


In [7]:
# Step 3: Encode categorical variables
# We convert categorical columns into numeric format using one-hot encoding.
# This is necessary for linear models (like Linear Regression or Logistic Regression),
# which cannot handle non-numeric data. `drop_first=True` avoids the dummy variable trap.
categorical_cols = ["cp", "restecg", "slope", "thal"]
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Step 4: Convert boolean columns to integers
# Boolean columns (True/False) are converted to integers (1/0)
# because most machine learning models require numeric input.
X_encoded = X_encoded.astype(int)

#checking the final encoded dataframe
X_encoded.head() 
      



Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,cp_2,cp_3,cp_4,restecg_1,restecg_2,slope_2,slope_3,thal_6.0,thal_7.0
0,63,1,145,233,1,150,0,2,0,0,0,0,0,1,0,1,1,0
1,67,1,160,286,0,108,1,1,3,0,0,1,0,1,1,0,0,0
2,67,1,120,229,0,129,1,2,2,0,0,1,0,1,1,0,0,1
3,37,1,130,250,0,187,0,3,0,0,1,0,0,0,0,1,0,0
4,41,0,130,204,0,172,0,1,0,1,0,0,0,1,0,0,0,0


In [8]:
# Step 5: Scale numerical features
# Standardize continuous columns to have mean = 0 and standard deviation = 1.
# This helps models (especially linear models) perform better by putting all numeric features on the same scale.
    
X_encoded_scaled = X_encoded
scaled=StandardScaler()
cols = ["age", "trestbps", "chol", "thalach", "oldpeak"]
X_encoded_scaled[cols] = scaled.fit_transform(X_encoded_scaled[cols])
X_encoded_scaled=X_encoded_scaled.reset_index(drop=True)
y=y.reset_index(drop=True)
#now our dataset is more suitable for linear models
df_s_e = pd.concat([X_encoded_scaled, y], axis=1)  
df_s_e.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,cp_2,cp_3,cp_4,restecg_1,restecg_2,slope_2,slope_3,thal_6.0,thal_7.0,num
0,0.936181,1,0.75038,-0.276443,1,0.017494,0,1.136764,0,0,0,0,0,1,0,1,1,0,0
1,1.378929,1,1.596266,0.744555,0,-1.816334,1,0.206684,3,0,0,1,0,1,1,0,0,0,2
2,1.378929,1,-0.659431,-0.3535,0,-0.89942,1,1.136764,2,0,0,1,0,1,1,0,0,1,1
3,-1.94168,1,-0.095506,0.051047,0,1.63301,0,2.066844,0,0,1,0,0,0,0,1,0,0,0
4,-1.498933,0,-0.095506,-0.835103,0,0.978071,0,0.206684,0,1,0,0,0,1,0,0,0,0,0


# Now, we have a Cleaned datasets that are ready for modeling 
