In [6]:
import pandas as pd
import numpy as np

# Cleaning
# Load data
path = "/Users/tingwei/Downloads/archive/heart_statlog_cleveland_hungary_final.csv"
df = pd.read_csv(path)
df

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1185,45,1,1,110,264,0,0,132,0,1.2,2,1
1186,68,1,4,144,193,1,0,141,0,3.4,2,1
1187,57,1,4,130,131,0,0,115,1,1.2,2,1
1188,57,0,2,130,236,0,2,174,0,0.0,2,1


In [3]:
print("Original shape:", df.shape)
print("Columns:", df.columns.tolist())

# Basic quality checks
print("\nMissing values per column:\n", df.isna().sum())
dup_count = df.duplicated().sum()
print("\nDuplicated rows:", dup_count)

# Remove duplicates
df = df.drop_duplicates().reset_index(drop=True)
print("\nAfter dropping duplicates:", df.shape)

# Validate target
target_col = "target"
print("\nTarget value counts:\n", df[target_col].value_counts())
assert set(df[target_col].unique()).issubset({0, 1}), "Target is not binary {0,1}!"

# Handle implicit missing values
zero_as_missing_cols = ["cholesterol", "resting bp s", "ST slope"]

for c in zero_as_missing_cols:
    if c in df.columns:
        zero_count = (df[c] == 0).sum()
        print(f"\n{c}: zero count before = {zero_count}")
        df.loc[df[c] == 0, c] = np.nan

print("\nMissing values after converting zeros to NaN:\n", df.isna().sum())

# Imputation
# Decide which columns are continuous vs categorical
continuous_cols = ["age", "resting bp s", "cholesterol", "max heart rate", "oldpeak"]
categorical_cols = ["sex", "chest pain type", "fasting blood sugar",
                    "resting ecg", "exercise angina", "ST slope"]

# Median imputation for continuous
for c in continuous_cols:
    if c in df.columns:
        med = df[c].median()
        df[c] = df[c].fillna(med)

# Mode imputation for categorical/discrete
for c in categorical_cols:
    if c in df.columns:
        mode = df[c].mode(dropna=True)[0]
        df[c] = df[c].fillna(mode)

# Ensure categorical cols are int-coded
for c in categorical_cols + [target_col]:
    if c in df.columns:
        df[c] = df[c].astype(int)

print("\nFinal missing values per column:\n", df.isna().sum())
print("\nFinal shape:", df.shape)
print("\nData types:\n", df.dtypes)


# EDA
import os
import matplotlib.pyplot as plt

target_col = "target"

OUT_DIR = "eda_figures"
os.makedirs(OUT_DIR, exist_ok=True)

def save_fig(filename: str):
    path = os.path.join(OUT_DIR, filename)
    plt.tight_layout()
    plt.savefig(path, dpi=200, bbox_inches="tight")
    plt.close()
    print("Saved:", path)

# Target class distribution
counts = df[target_col].value_counts().sort_index()

plt.figure(figsize=(5, 4))
plt.bar(counts.index.astype(str), counts.values)
plt.title("Target Class Distribution")
plt.xlabel("target (0=no disease, 1=disease)")
plt.ylabel("count")
save_fig("fig1_target_distribution.png")

# Continuous feature histograms
continuous_cols = ["age", "resting bp s", "cholesterol", "max heart rate", "oldpeak"]
continuous_cols = [c for c in continuous_cols if c in df.columns]

n = len(continuous_cols)
cols = 3
rows = int(np.ceil(n / cols))

plt.figure(figsize=(12, 4 * rows))
for i, c in enumerate(continuous_cols, 1):
    ax = plt.subplot(rows, cols, i)
    ax.hist(df[c], bins=30)
    ax.set_title(c)
    ax.set_xlabel(c)
    ax.set_ylabel("count")

plt.suptitle("Continuous Feature Distributions", y=1.02)
save_fig("fig2_continuous_histograms.png")

# oldpeak by target
feature = "oldpeak"
if feature in df.columns:
    data0 = df[df[target_col] == 0][feature]
    data1 = df[df[target_col] == 1][feature]

    plt.figure(figsize=(6, 4))
    plt.boxplot([data0, data1], labels=["target=0", "target=1"])
    plt.title("oldpeak by Target Class")
    plt.ylabel(feature)
    save_fig("fig3_oldpeak_by_target.png")
else:
    print("Warning: oldpeak not found, skip FIG 3")

# max heart rate by target (boxplot)
feature = "max heart rate"
if feature in df.columns:
    data0 = df[df[target_col] == 0][feature]
    data1 = df[df[target_col] == 1][feature]

    plt.figure(figsize=(6, 4))
    plt.boxplot([data0, data1], labels=["target=0", "target=1"])
    plt.title("max heart rate by Target Class")
    plt.ylabel(feature)
    save_fig("fig4_maxheartrate_by_target.png")

# Disease rate by exercise angina
cat = "exercise angina"
if cat in df.columns:
    rates = df.groupby(cat)[target_col].mean().sort_index()
    counts = df[cat].value_counts().sort_index()

    plt.figure(figsize=(6, 4))
    plt.bar(rates.index.astype(str), rates.values)
    plt.title("Disease Rate by Exercise Angina")
    plt.xlabel(cat + " (0=no, 1=yes)")
    plt.ylabel("P(target=1)")

    for i, (k, v) in enumerate(rates.items()):
        plt.text(i, v, f"n={counts.loc[k]}", ha="center", va="bottom", fontsize=9)

    save_fig("fig5_exerciseangina_disease_rate.png")

print("\nDone. Figures saved in:", OUT_DIR)

Original shape: (918, 12)
Columns: ['age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol', 'fasting blood sugar', 'resting ecg', 'max heart rate', 'exercise angina', 'oldpeak', 'ST slope', 'target']

Missing values per column:
 age                    0
sex                    0
chest pain type        0
resting bp s           0
cholesterol            0
fasting blood sugar    0
resting ecg            0
max heart rate         0
exercise angina        0
oldpeak                0
ST slope               0
target                 0
dtype: int64

Duplicated rows: 0

After dropping duplicates: (918, 12)

Target value counts:
 target
1    508
0    410
Name: count, dtype: int64

cholesterol: zero count before = 0

resting bp s: zero count before = 0

ST slope: zero count before = 0

Missing values after converting zeros to NaN:
 age                    0
sex                    0
chest pain type        0
resting bp s           0
cholesterol            0
fasting blood sugar    0
resting ecg   

  plt.boxplot([data0, data1], labels=["target=0", "target=1"])


Saved: eda_figures/fig3_oldpeak_by_target.png


  plt.boxplot([data0, data1], labels=["target=0", "target=1"])


Saved: eda_figures/fig4_maxheartrate_by_target.png
Saved: eda_figures/fig5_exerciseangina_disease_rate.png

Done. Figures saved in: eda_figures


In [None]:
# =============================================================================
# Feature Engineering
# =============================================================================

from sklearn.preprocessing import StandardScaler
import seaborn as sns

# Save original data copy for feature creation
df_original = df.copy()

# -----------------------------------------------------------------------------
# 1. Feature Creation - Before scaling
# -----------------------------------------------------------------------------

# Age groups (based on heart disease risk age brackets)
df['age_group'] = pd.cut(df_original['age'], bins=[0, 40, 55, 70, 100], labels=[0, 1, 2, 3]).astype(int)

# Heart rate reserve (max predicted heart rate - actual max heart rate)
df['heart_rate_reserve'] = (220 - df_original['age']) - df_original['max heart rate']

# Cholesterol risk level (>200 is high risk)
df['chol_risk'] = (df_original['cholesterol'] > 200).astype(int)

# Blood pressure risk (>140 is hypertension)
df['bp_risk'] = (df_original['resting bp s'] > 140).astype(int)

# Oldpeak abnormal indicator
df['oldpeak_abnormal'] = (df_original['oldpeak'] > 0).astype(int)

print("New features created:")
print("- age_group: Age grouping (0=<40, 1=40-55, 2=55-70, 3=>70)")
print("- heart_rate_reserve: Heart rate reserve")
print("- chol_risk: Cholesterol risk (1=high)")
print("- bp_risk: Blood pressure risk (1=high)")
print("- oldpeak_abnormal: ST depression abnormal")

print("\nShape after feature creation:", df.shape)

In [None]:
# -----------------------------------------------------------------------------
# 2. Categorical Variable One-Hot Encoding
# -----------------------------------------------------------------------------

# Columns to One-Hot encode (multi-class)
onehot_cols = ['chest pain type', 'ST slope', 'resting ecg']

print("Before One-Hot encoding:", df.shape)
print("Columns to encode:", onehot_cols)

for col in onehot_cols:
    print(f"  {col}: {df[col].unique()}")

# One-Hot encoding, drop_first=True to avoid multicollinearity
df = pd.get_dummies(df, columns=onehot_cols, drop_first=True)

print("\nAfter One-Hot encoding:", df.shape)
print("New columns:", [c for c in df.columns if any(oc in c for oc in onehot_cols)])

In [None]:
# -----------------------------------------------------------------------------
# 3. Feature Scaling
# -----------------------------------------------------------------------------

# Continuous variables to scale
continuous_cols = ["age", "resting bp s", "cholesterol", "max heart rate", "oldpeak", "heart_rate_reserve"]

print("Scaling continuous features:", continuous_cols)
print("\nBefore scaling - statistics:")
print(df[continuous_cols].describe().round(2))

# Use StandardScaler (Z-score normalization)
scaler = StandardScaler()
df[continuous_cols] = scaler.fit_transform(df[continuous_cols])

print("\nAfter scaling - statistics:")
print(df[continuous_cols].describe().round(2))

In [None]:
# -----------------------------------------------------------------------------
# 4. Correlation Analysis
# -----------------------------------------------------------------------------

# Compute correlation matrix
corr_matrix = df.corr()

# Correlation with target
target_corr = corr_matrix['target'].drop('target').sort_values(key=abs, ascending=False)
print("Feature correlation with target (sorted by |r|):\n")
print(target_corr.round(3))

# Plot correlation heatmap
plt.figure(figsize=(14, 12))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', 
            center=0, square=True, linewidths=0.5,
            annot_kws={'size': 8})
plt.title('Feature Correlation Matrix', fontsize=14)
plt.tight_layout()
plt.savefig('eda_figures/fig6_correlation_matrix.png', dpi=200, bbox_inches='tight')
plt.show()
print("\nSaved: eda_figures/fig6_correlation_matrix.png")

In [None]:
# -----------------------------------------------------------------------------
# 5. 特征选择 (Feature Selection)
# -----------------------------------------------------------------------------

from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

# 准备 X 和 y
X = df.drop('target', axis=1)
y = df['target']

print(f"Total features: {X.shape[1]}")
print(f"Feature names: {X.columns.tolist()}\n")

# 使用 ANOVA F-test 评估特征重要性
selector = SelectKBest(f_classif, k='all')
selector.fit(X, y)

# 特征得分排名
feature_scores = pd.DataFrame({
    'feature': X.columns,
    'f_score': selector.scores_,
    'p_value': selector.pvalues_
}).sort_values('f_score', ascending=False)

print("Feature Importance (ANOVA F-test):\n")
print(feature_scores.round(4))

# 可视化
plt.figure(figsize=(10, 8))
plt.barh(feature_scores['feature'], feature_scores['f_score'])
plt.xlabel('F-Score')
plt.title('Feature Importance (ANOVA F-test)')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig('eda_figures/fig7_feature_importance.png', dpi=200, bbox_inches='tight')
plt.show()
print("\nSaved: eda_figures/fig7_feature_importance.png")

In [None]:
# -----------------------------------------------------------------------------
# 6. Save Processed Data & Summary
# -----------------------------------------------------------------------------

# Select significant features (optional: based on p_value < 0.05)
significant_features = feature_scores[feature_scores['p_value'] < 0.05]['feature'].tolist()
print(f"Significant features (p < 0.05): {len(significant_features)}")
print(significant_features)

# Final dataset summary
print("\n" + "="*60)
print("FEATURE ENGINEERING SUMMARY")
print("="*60)
print(f"Original features: 11")
print(f"Final features: {X.shape[1]}")
print(f"Samples: {X.shape[0]}")
print(f"\nTarget distribution:")
print(f"  - Class 0 (No disease): {(y == 0).sum()} ({(y == 0).mean()*100:.1f}%)")
print(f"  - Class 1 (Disease): {(y == 1).sum()} ({(y == 1).mean()*100:.1f}%)")

# Save processed data
output_path = "heart_disease_processed.csv"
df.to_csv(output_path, index=False)
print(f"\nProcessed data saved to: {output_path}")

# Display final dataset preview
print("\nFinal dataset preview:")
df.head()