In [10]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler


# =========================================================================
# 1. DATA LOADING AND CLEANING
# =========================================================================

# Download the dataset registered under repository id 45 and pull out the
# feature table and the target table.
heart_disease = fetch_ucirepo(id=45)
y = heart_disease.data.targets

# -9 is treated as a missing-value marker: turn it into NaN everywhere
# (replace() returns a new frame, so the fetched object is left untouched).
X = heart_disease.data.features.replace(-9, np.nan)

# Fill the gaps in 'ca' and 'thal' with each column's most frequent value.
for column in ('ca', 'thal'):
    X[column] = X[column].fillna(X[column].mode()[0])

# Sanity check: total count of NaN cells left across the whole feature table.
print("Missing values after imputation:\n", X.isna().sum().sum())

# =========================================================================
# 2. DATA ENCODING AND SCALING
# =========================================================================

# Column groups: continuous measurements vs. categorical features that are
# stored as numeric labels.
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# One-hot encode the categorical features. drop_first=True omits the first
# level of each feature, so k categories become k-1 indicator columns.
X_encoded = pd.get_dummies(data=X, columns=categorical_cols, drop_first=True)

# Fit the scaler on the continuous columns, then overwrite those columns with
# their standardized values.
scaler = StandardScaler().fit(X_encoded[numerical_cols])
X_encoded[numerical_cols] = scaler.transform(X_encoded[numerical_cols])
import pandas as pd
import numpy as np

import pandas as pd
import os

# Where the normalized input lives and where the model-ready file is written
OUTPUT_DIR = 'data'
OUTPUT_FILENAME = 'heart_disease_model_ready.csv'
INPUT_PATH = 'data/heart_disease_normalized.csv'
OUTPUT_PATH = os.path.join(OUTPUT_DIR, OUTPUT_FILENAME)

# ----------------------------------------------------------------------
# 1. Load the Normalized Data
# ----------------------------------------------------------------------
# Read the CSV if it exists; otherwise fall back to a tiny hand-written
# frame so the rest of the script can still be demonstrated.
try:
    df = pd.read_csv(INPUT_PATH)
except FileNotFoundError:
    print(f"Error: Input file not found at {INPUT_PATH}.")
    print("Using mock data to demonstrate the cleaning process.")
    # NOTE(review): everything downstream (including the saved CSV) will be
    # built from this 4-row sample when the real input is missing.
    mock_data = {
        'age': [0.45, 0.70, 0.85, 0.50],
        'sex': [1, 0, 1, 0],  # already binary (1/0), used as is
        'cp': [1, 3, 4, 2],   # multi-valued categorical, encoded later
        'chol': [0.62, 0.55, 0.75, 0.40],
        'thalach': [0.33, 0.55, 0.70, 0.88],
        'target': [0, 1, 0, 1]
    }
    df = pd.DataFrame(mock_data)
else:
    # Only reached when the read succeeded
    print(f"Data loaded successfully from {INPUT_PATH}. Shape: {df.shape}")

# ----------------------------------------------------------------------
# 2. Perform Final Cleaning: One-Hot Encoding
# ----------------------------------------------------------------------
# Multi-valued categorical columns to expand into indicator columns; binary
# (0/1) columns are left as they are. The list is maintained by hand.
# NOTE(review): only 'cp' is listed, while the earlier preprocessing cell also
# treats 'restecg', 'slope', 'ca' and 'thal' as categorical -- confirm whether
# those columns exist in this input and should be encoded here as well.
categorical_features = ['cp']

print(f"\nApplying One-Hot Encoding to: {categorical_features}")
# drop_first=True omits the first level of each feature; dtype=int yields 0/1
# integers instead of booleans.
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True, dtype=int)

# Report exactly the columns that encoding added (e.g., 'cp_2', 'cp_3', 'cp_4')
# by comparing against the pre-encoding columns. A substring test such as
# "'cp_' in col" would ignore other entries in categorical_features and could
# match unrelated columns whose names merely contain 'cp_'.
original_columns = set(df.columns)
new_columns = [col for col in df_encoded.columns if col not in original_columns]
print(f"Encoded data shape: {df_encoded.shape}")
print(f"New columns created: {new_columns}")


# ----------------------------------------------------------------------
# 3. Save the Final Model-Ready Artifact
# ----------------------------------------------------------------------

# Make sure the output directory exists. exist_ok=True removes the
# check-then-create race of "if not exists: makedirs" (another process creating
# the directory in between would otherwise raise FileExistsError). The
# pre-check is kept only so the "Created directory" message is still printed
# when the directory is new.
output_dir_existed = os.path.isdir(OUTPUT_DIR)
os.makedirs(OUTPUT_DIR, exist_ok=True)
if not output_dir_existed:
    print(f"Created directory: {OUTPUT_DIR}")

# Write the encoded frame without the index column
df_encoded.to_csv(OUTPUT_PATH, index=False)

print("\n-------------------------------------------------------------")
print(f"✔️ Fully cleaned, model-ready dataset saved to: {OUTPUT_PATH}")
print("This file is now ready for Feature Selection and Model Training.")
print("-------------------------------------------------------------")



Missing values after imputation:
 0
Data loaded successfully from data/heart_disease_normalized.csv. Shape: (4, 6)

Applying One-Hot Encoding to: ['cp']
Encoded data shape: (4, 8)
New columns created: ['cp_2', 'cp_3', 'cp_4']

-------------------------------------------------------------
✔️ Fully cleaned, model-ready dataset saved to: data\heart_disease_model_ready.csv
This file is now ready for Feature Selection and Model Training.
-------------------------------------------------------------
