In [1]:
# Standard-library imports first, then third-party libraries for data manipulation and preprocessing.
import pickle

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the heart-disease CSV into the raw working frame `df`
df = pd.read_csv(filepath_or_buffer="../data/processed/heart.csv")

In [3]:
# Experiment on a deep copy so the freshly loaded frame `df` stays untouched
df_clean = df.copy(deep=True)

print("Original Shape:", df_clean.shape)
df_clean.head(5)

Original Shape: (967, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [6]:
# Integer-coded columns that are treated as nominal categories
cat_cols = ['cp', 'thal', 'slope']

# One-hot encode them. drop_first=True drops the first level of each column,
# so the remaining dummies are not perfectly collinear.
# The encoding is built from `df`, so re-running this cell rebuilds `df_clean`
# from scratch.
df_clean = pd.get_dummies(data=df, columns=cat_cols, drop_first=True)

print(f"Columns increased from {df.shape[1]} to {df_clean.shape[1]}")
df_clean.head(5)

Columns increased from 14 to 18


Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,ca,condition,cp_1,cp_2,cp_3,thal_1,thal_2,slope_1,slope_2
0,69,1,160,234,1,2,131,0,0.1,1,0,False,False,False,False,False,True,False
1,69,0,140,239,0,0,151,0,1.8,2,0,False,False,False,False,False,False,False
2,66,0,150,226,0,0,114,0,2.6,0,0,False,False,False,False,False,False,True
3,65,1,138,282,1,2,174,0,1.4,1,1,False,False,False,False,False,True,False
4,64,1,110,211,0,2,144,1,1.8,0,0,False,False,False,False,False,True,False


In [None]:
def cap_outliers(series, lower_q=0.01, upper_q=0.99):
    """Cap (winsorize) the values of a numeric Series at two quantiles.

    Values above the ``upper_q`` quantile are replaced by that quantile and
    values below the ``lower_q`` quantile are replaced by that quantile;
    everything in between is returned unchanged.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to cap.
    lower_q : float, default 0.01
        Quantile (between 0 and 1) used as the lower cap.
    upper_q : float, default 0.99
        Quantile (between 0 and 1) used as the upper cap.

    Returns
    -------
    numpy.ndarray
        Capped values in the same order as ``series``. The result is a plain
        array (the Series index is not carried over), so assign it back
        positionally, e.g. ``df[col] = cap_outliers(df[col])``.

    Raises
    ------
    ValueError
        If the quantiles are not ordered as ``0 <= lower_q <= upper_q <= 1``.
    """
    if not 0 <= lower_q <= upper_q <= 1:
        raise ValueError(
            f"Expected 0 <= lower_q <= upper_q <= 1, got "
            f"lower_q={lower_q!r}, upper_q={upper_q!r}"
        )

    lower = series.quantile(lower_q)
    upper = series.quantile(upper_q)

    # Nested np.where: cap the high tail first, then the low tail; values that
    # fail both comparisons (including NaN) pass through untouched.
    return np.where(series > upper, upper,
                    np.where(series < lower, lower, series))

# Winsorize each continuous measurement in turn, overwriting the column in df_clean
for col in ('chol', 'trestbps', 'thalach'):
    df_clean[col] = cap_outliers(series=df_clean[col])

print("Outliers capped. Max Cholesterol is now:", df_clean['chol'].max())

Outliers capped. Max Cholesterol is now: 368.0200000000001


In [8]:
# Columns to standardize (the original author excludes the binary target/dummy columns)
cols_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Z-score standardization in two explicit steps: learn the per-column
# mean/std, then apply them.
# NOTE(review): the result overwrites df_clean in place, so running this cell a
# second time refits the scaler on already-standardized values. Re-run the
# notebook from the top before saving the scaler.
scaler = StandardScaler()
scaler.fit(df_clean[cols_to_scale])
df_clean[cols_to_scale] = scaler.transform(df_clean[cols_to_scale])

# Sanity check: every scaled column should show mean ~0 and std ~1
df_clean[cols_to_scale].describe().round(decimals=2)

Unnamed: 0,age,trestbps,chol,thalach,oldpeak
count,967.0,967.0,967.0,967.0,967.0
mean,0.0,0.0,0.0,0.0,-0.0
std,1.0,1.0,1.0,1.0,1.0
min,-1.93,-2.15,-2.29,-2.56,-1.2
25%,-0.77,-0.67,-0.69,-0.69,-0.8
50%,0.08,-0.03,-0.07,0.06,-0.2
75%,0.77,0.63,0.64,0.65,0.59
max,1.77,2.62,2.59,2.13,4.96


In [9]:
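# # NOTE: this whole cell is a disabled experiment (risk-score feature), kept for reference only.
# # It reads the original dataframe 'df' for the thresholds, because 'df_clean' is already scaled!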
# def calculate_risk(row):
#     risk_points = 0
#     if row['age'] > 60: risk_points += 1
#     if row['trestbps'] > 140: risk_points += 1
#     if row['chol'] > 240: risk_points += 1
#     return risk_points

# # Create the new feature
# df_clean['risk_score'] = df.apply(calculate_risk, axis=1)

# # Check correlation: Does this new feature predict the disease?
# print("Correlation of Risk Score with Target:")
# print(df_clean[['risk_score', 'condition']].corr())

In [10]:
# Write the processed frame to the processed-data folder, leaving out the row index
df_clean.to_csv(path_or_buf='../data/processed/heart_processed.csv', index=False)
print("File saved to data/processed/heart_processed.csv")

File saved to data/processed/heart_processed.csv


In [11]:
# Persist the StandardScaler fitted in the preprocessing step.
# `pickle` is imported in the notebook's top import cell rather than here, so
# every dependency is visible in one place.
# NOTE: only unpickle this file from a trusted location; loading a pickle can
# execute arbitrary code.
with open("../models/scaler.pkl", "wb") as file:
    pickle.dump(scaler, file)
print("Scaler saved! ✅")

Scaler saved! ✅
