In [11]:
# ===============================
# Improved & Fixed Pipeline with Google Drive
# ===============================

# Step 0: Mount Google Drive
# The dataset is read from, and the processed CSV is written to, paths under
# /content/drive/MyDrive later in this cell, so the Drive must be mounted first.
# NOTE(review): the google.colab package is presumably only importable inside a
# Colab runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

# Step 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Step 2: Read the cleaned heart-attack dataset from the mounted Drive
df = pd.read_csv("/content/drive/MyDrive/heartAttack_clean.csv")

# Step 3: Split the frame into the predictor columns (X) and the label column (y)
target_col = "heart_attack"
X = df.drop(columns=target_col)
y = df[target_col]

# Step 4: Group predictor columns by dtype so each group can be preprocessed
# differently: object-dtype columns are treated as categorical, numeric-dtype
# columns as numeric.
# NOTE(review): the feature list printed by this cell contains both
# 'stress_level_Moderate' and 'stress_level_moderate', so the raw stress_level
# values appear to mix letter case — consider normalising categorical text
# before encoding.
categorical_cols = list(X.select_dtypes(include="object").columns)
numeric_cols = list(X.select_dtypes(include="number").columns)

# Step 5: Preprocessing (one-hot encode categoricals, keep numeric as is)
# - 'num': numeric columns are passed through unchanged.
# - 'cat': object columns are one-hot encoded; drop='first' removes the first
#   category of each column so the remaining indicators are not redundant.
# Columns listed in neither group are dropped (ColumnTransformer's default
# remainder behaviour).
# NOTE(review): OneHotEncoder's default handle_unknown raises if the test split
# contains a category never seen in training — confirm that is the desired
# behaviour for rare categories.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_cols),
        ('cat', OneHotEncoder(drop='first'), categorical_cols)
    ],
    # Always return a dense array. With the default threshold (0.3) the stacked
    # output becomes a SciPy sparse matrix whenever the one-hot block makes the
    # result sparse enough, and the later pd.DataFrame(..., columns=...) wrapping
    # of the encoded data does not accept a sparse matrix.
    sparse_threshold=0,
)

# Step 6: Hold out 20% of the rows as a test set BEFORE any balancing.
# The split is stratified on y and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Step 7: Fit the preprocessing on the training rows only, then apply the
# already-fitted transformer to the test rows (no information from the test set
# is used while fitting).
train_matrix = preprocessor.fit_transform(X_train)
test_matrix = preprocessor.transform(X_test)

# Step 8: Rebuild column labels for the encoded matrices. The order mirrors the
# transformer order in `preprocessor`: numeric columns first, then the one-hot
# indicator columns.
cat_encoder = preprocessor.named_transformers_['cat']
cat_features = cat_encoder.get_feature_names_out(categorical_cols)
all_features = [*numeric_cols, *cat_features]

# Wrap both matrices as DataFrames (each gets a fresh 0..n-1 row index).
X_train_encoded, X_test_encoded = (
    pd.DataFrame(matrix, columns=all_features)
    for matrix in (train_matrix, test_matrix)
)


# Step 9: Oversample with SMOTE — training data only, the test split is left
# untouched.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_encoded, y_train)

# Step 10: Score every feature with f_classif on the balanced training data and
# keep the 20 best; the same fitted selector is then applied to the test data.
selector = SelectKBest(score_func=f_classif, k=20).fit(X_train_bal, y_train_bal)
X_train_selected = selector.transform(X_train_bal)
X_test_selected = selector.transform(X_test_encoded)

# Step 11: Translate the selector's boolean mask back into feature names
selected_features = [
    feature_name
    for feature_name, is_kept in zip(all_features, selector.get_support())
    if is_kept
]


# Step 12: Create final DataFrames with selected_features (top 20)
# .copy() gives an independent frame so the column assignments below cannot
# touch (or warn about touching) X_train_bal.
X_train_final = X_train_bal[selected_features].copy()
X_test_final = pd.DataFrame(X_test_selected, columns=selected_features)


# Step 13: Convert integer-like columns (e.g., age) back to int
# SMOTE creates synthetic rows by interpolating between neighbours, so columns
# that only held whole numbers before resampling (counts, 0/1 flags, one-hot
# indicators) end up with fractional values. A column is treated as integer-like
# only if every value in the pre-SMOTE training data is a whole number; genuinely
# continuous measurements are therefore left untouched instead of being rounded.
whole_valued = X_train_encoded[selected_features].mod(1).eq(0).all()
integer_like_cols = whole_valued[whole_valued].index.tolist()

for col in integer_like_cols:
    X_train_final[col] = X_train_final[col].round().astype(int)
    # Test rows were never resampled, so this only changes the dtype; it keeps
    # train and test columns consistent.
    X_test_final[col] = X_test_final[col].round().astype(int)


# Step 14: Show summary (before adding gender_Male)
print("Initial training shape (Top 20 features):", X_train_final.shape)
print("Selected features (Top 20):", selected_features)


# Step 15: (no intermediate save) The processed training set is written once,
# in Step 19, after gender_Male has been added.

# Step 16: Add 'gender_Male' column to X_train_final and X_test_final
# The column is forced into the final feature set even when SelectKBest did not
# pick it. Values are rounded back to 0/1 because the SMOTE-balanced training
# data contains interpolated (fractional) indicator values for synthetic rows;
# for the un-resampled test data the rounding only changes the dtype.
forced_feature = 'gender_Male'

if forced_feature in X_train_bal.columns and forced_feature not in X_train_final.columns:
    X_train_final[forced_feature] = X_train_bal[forced_feature].round().astype(int)
if forced_feature in X_test_encoded.columns and forced_feature not in X_test_final.columns:
    X_test_final[forced_feature] = X_test_encoded[forced_feature].round().astype(int)


# Step 17: Update selected features list to include gender_Male
# Only append when the column really made it into the training frame, and never
# add it twice.
if forced_feature not in selected_features and forced_feature in X_train_final.columns:
    selected_features.append(forced_feature)


# Step 18: Show final summary
print("\nFinal training shape (with gender_Male):", X_train_final.shape)
print("Final selected features:", selected_features)


# Step 19: Write the final processed training set (features + label) to Drive
processed_train_path = "/content/drive/MyDrive/heart_attack_train_processed.csv"

# The label index is reset so it lines up row-for-row with the feature frame
# when the two are placed side by side.
balanced_target = y_train_bal.reset_index(drop=True)
final_train = pd.concat([X_train_final, balanced_target], axis="columns")
final_train.to_csv(processed_train_path, index=False)

# Step 20: Offer the saved file as a browser download (optional)
from google.colab import files
files.download(processed_train_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Initial training shape (Top 20 features): (151768, 20)
Selected features (Top 20): ['age', 'hypertension', 'diabetes', 'cholesterol_level', 'obesity', 'waist_circumference', 'sleep_hours', 'fasting_blood_sugar', 'triglycerides', 'previous_heart_disease', 'medication_usage', 'region_Urban', 'income_level_middle', 'smoking_status_Never', 'smoking_status_Past', 'smoking_status_Unknown', 'physical_activity_Low', 'stress_level_Moderate', 'stress_level_moderate', 'EKG_results_Normal']

Final training shape (with gender_Male): (151768, 21)
Final selected features: ['age', 'hypertension', 'diabetes', 'cholesterol_level', 'obesity', 'waist_circumference', 'sleep_hours', 'fasting_blood_sugar', 'triglycerides', 'previous_heart_disease', 'medication_usage', 'region_Urban', 'income_level_middle', 'smoking_status_Never', 'smoking_status_Past', 'smoking_status_Unknown', 'ph

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>