In [1]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, confusion_matrix, 
                             classification_report, roc_curve)
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
warnings.filterwarnings('ignore')

In [2]:
print("=" * 60)
print("STEP 1: Loading Data")
print("=" * 60)

# Folder that holds both CSV files. Set the HEART_DATA_DIR environment
# variable to point somewhere else without editing the notebook; the default
# is the folder the notebook was originally run from.
DATA_DIR = Path(os.environ.get("HEART_DATA_DIR", r'D:\5th sem\EM06_Sindorai\AI'))
TRAIN_CSV = DATA_DIR / 'Heart_Attack_training_dataset.csv'
# NOTE(review): the evaluation file name is spelled "Hear_Attack" (no "t");
# it is kept as-is because the recorded run loaded it under that name.
TEST_CSV = DATA_DIR / 'Hear_Attack_evaluation_dataset.csv'

# Load training and test datasets
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print("\nFirst few rows of training data:")
print(train_df.head())

STEP 1: Loading Data
Training data shape: (7963, 26)
Test data shape: (800, 25)

First few rows of training data:
  patient_id  age     sex  chol       bp  hr  diabetes  family_history  \
0    BMW7812   67    Male   208   158/88  72         0               0   
1    CZE1114   21    Male   389   165/93  98         1               1   
2    BNI9906   21  Female   324   174/99  72         1               0   
3    JLN3497   84    Male   383  163/100  73         1               1   
4    GFO8847   66    Male   318    91/88  93         1               1   

   smoking  obesity  ...  sedentary_hr  income        bmi  triglycerides  \
0        1        0  ...      6.615001  261404  31.251233            286   
1        1        1  ...      4.963459  285768  27.194973            235   
2        0        0  ...      9.463426  235282  28.176571            587   
3        1        0  ...      7.648981  125640  36.464704            378   
4        1        1  ...      1.514821  160555  21.809144    

In [2]:
# Run this before the imports cell on a fresh environment: that cell imports
# imblearn. %pip installs into the running kernel's environment, and the
# version is pinned to the one this notebook was last run with.
%pip install -q imbalanced-learn==0.14.0

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Downloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.14.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
print("\n" + "=" * 60)
print("STEP 2: Exploratory Data Analysis")
print("=" * 60)

# Check data types and missing values
print("\nData Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
train_df.info()

print("\nMissing Values:")
print(train_df.isnull().sum())

print("\nBasic Statistics:")
print(train_df.describe())

# Check target variable distribution (absolute counts, then proportions)
print("\nTarget Variable Distribution:")
print(train_df['heart_attack_risk'].value_counts())
print(f"Class balance: {train_df['heart_attack_risk'].value_counts(normalize=True)}")



STEP 2: Exploratory Data Analysis

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7963 entries, 0 to 7962
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   patient_id         7963 non-null   object 
 1   age                7963 non-null   int64  
 2   sex                7963 non-null   object 
 3   chol               7963 non-null   int64  
 4   bp                 7963 non-null   object 
 5   hr                 7963 non-null   int64  
 6   diabetes           7963 non-null   int64  
 7   family_history     7963 non-null   int64  
 8   smoking            7963 non-null   int64  
 9   obesity            7963 non-null   int64  
 10  alcohol            7963 non-null   int64  
 11  exercise_hr_wk     7963 non-null   float64
 12  diet               7963 non-null   object 
 13  prev_heart_prob    7963 non-null   int64  
 14  med_use            7963 non-null   int64  
 15  stress_lvl         7963 n

In [4]:
def preprocess_data(df, is_train=True):
    """
    Clean a raw patient frame and return a new, mostly numeric DataFrame.

    The input frame is copied, never modified. Steps, in order:
      - Split 'bp' ("120/80") into numeric 'systolic_bp' / 'diastolic_bp';
        values that do not match "<number>/<number>" become NaN.
      - Ensure every risk-factor column exists (missing ones are created as
        0) and is numeric; common yes/no/boolean tokens are mapped to 1/0.
      - Fill missing values in object/category columns with the column mode
        ('Unknown' when the column has no non-missing value).
      - Encode 'sex' (Male=1, Female=0) and 'diet' (ordinal 0..2).
      - Replace 'country' / 'continent' / 'hemisphere' with '<col>_freq',
        the relative frequency of each value within the frame passed in.
      - Median-impute every numeric column that still contains NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. Every column handled here is optional.
    is_train : bool, default True
        When True, 'heart_attack_risk' is excluded from numeric imputation.

    Returns
    -------
    pandas.DataFrame
        Processed copy. Non-numeric columns that are not encoded here
        (for example 'patient_id') are kept as they are.
    """
    df = df.copy()

    # ---- Safe BP parsing ----
    # Accepts "120/80", " 120 / 80 ", floats-as-strings; coerces bad to NaN
    if 'bp' in df.columns:
        bp_parsed = df['bp'].astype(str).str.extract(
            r'^\s*(?P<systolic>\d+\.?\d*)\s*/\s*(?P<diastolic>\d+\.?\d*)\s*$'
        )
        df['systolic_bp'] = pd.to_numeric(bp_parsed['systolic'], errors='coerce')
        df['diastolic_bp'] = pd.to_numeric(bp_parsed['diastolic'], errors='coerce')
        df = df.drop(columns=['bp'], errors='ignore')

    # ---- Ensure risk columns exist (fill missing with 0) ----
    risk_cols = ['diabetes', 'smoking', 'obesity', 'family_history',
                 'prev_heart_prob', 'alcohol']
    # Same token set that create_features accepts, so both stages agree.
    yes_no = {'Yes': 1, 'No': 0, 'Y': 1, 'N': 0, 'yes': 1, 'no': 0,
              True: 1, False: 0}
    for c in risk_cols:
        if c not in df.columns:
            df[c] = 0
        # Map known tokens; anything unmapped falls back to the original
        # value and is then coerced (non-numeric leftovers become NaN).
        df[c] = pd.to_numeric(df[c].map(yes_no).fillna(df[c]), errors='coerce')

    # ---- Categorical fill (mode; handle all-NaN with 'Unknown') ----
    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    for c in cat_cols:
        mode_val = df[c].mode(dropna=True)
        fill_val = mode_val.iloc[0] if not mode_val.empty else 'Unknown'
        df[c] = df[c].fillna(fill_val)

    # ---- Binary encode 'sex' ----
    if 'sex' in df.columns:
        df['sex'] = df['sex'].map({'Male': 1, 'Female': 0})
        # Unexpected labels are NaN now; fill with the most common code (default 0)
        sex_mode = df['sex'].mode(dropna=True)
        df['sex'] = df['sex'].fillna(sex_mode.iloc[0] if not sex_mode.empty else 0)

    # ---- Ordinal encode 'diet' ----
    if 'diet' in df.columns:
        # Matching is case- and whitespace-insensitive.
        # NOTE(review): 'unhealthy' is treated as a synonym of 'poor' -- confirm
        # against the labels that actually occur in the dataset.
        diet_mapping = {'poor': 0, 'unhealthy': 0, 'average': 1, 'healthy': 2}
        diet_key = df['diet'].astype(str).str.strip().str.lower()
        # Unknown labels become NaN here and are median-imputed below.
        df['diet'] = pd.to_numeric(diet_key.map(diet_mapping), errors='coerce')

    # ---- Geo frequency encoding ----
    # Frequencies are computed from the frame passed in, so separate calls
    # (e.g. train vs. test) each use their own distribution.
    for col in ['country', 'continent', 'hemisphere']:
        if col in df.columns:
            freq = df[col].value_counts(normalize=True)
            df[col + '_freq'] = df[col].map(freq)
            df = df.drop(columns=[col], errors='ignore')

    # ---- Numeric impute (median) ----
    # The numeric column list is built only now, after encoding, so that
    # 'sex', 'diet' and the '*_freq' columns are covered as well.
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if is_train and 'heart_attack_risk' in num_cols:
        num_cols.remove('heart_attack_risk')
    for c in num_cols:
        if df[c].isna().any():
            median = df[c].median()
            # An all-NaN column has a NaN median; fall back to 0 so the
            # fill actually removes the missing values.
            df[c] = df[c].fillna(0 if pd.isna(median) else median)

    return df



In [5]:
# Run the shared cleaning routine on both raw frames; only the training
# frame is flagged as containing the target column.
train_processed, test_processed = (
    preprocess_data(raw_frame, is_train=has_target)
    for raw_frame, has_target in ((train_df, True), (test_df, False))
)


In [6]:
# Report the frame sizes after preprocessing (one line per frame).
print(
    f"\nProcessed training data shape: {train_processed.shape}",
    f"Processed test data shape: {test_processed.shape}",
    sep="\n",
)


Processed training data shape: (7963, 27)
Processed test data shape: (800, 26)


In [7]:
# ------------------------------------------
# STEP 4: FEATURE ENGINEERING
# ------------------------------------------
# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 60, "STEP 4: Feature Engineering", "=" * 60, sep="\n")

def _bin_index(values, edges):
    """Return the 0-based index of the right-closed bin each value falls in.

    Values below the first edge are clipped onto it, and values that are
    still missing are assigned to bin 0, so the result is always an integer
    Series without NaN.
    """
    numeric = pd.to_numeric(values, errors='coerce').clip(lower=edges[0])
    codes = pd.cut(numeric, bins=edges, labels=False,
                   include_lowest=True, right=True)
    return codes.fillna(0).astype(int)


def create_features(df):
    """
    Add derived features to a preprocessed frame and return a new DataFrame.

    The input frame is copied, never modified. Columns added or overwritten:
      - pulse_pressure: systolic_bp - diastolic_bp (0.0 if either is absent)
      - health_score: exercise_hr_wk + phys_act_days + sleep_hr
        - sedentary_hr - stress_lvl (absent inputs are created as 0.0)
      - the six risk-factor columns, coerced to integer 0/1-style codes
        (absent ones are created as 0), and risk_factor_count, their row sum
      - age_group: bin index of age over (0-35, 35-50, 50-65, 65+)
      - bmi_category: bin index of bmi over (0-18.5, 18.5-25, 25-30, 30+)

    Any +/-inf left in the result is replaced by NaN before returning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame as produced by the preprocessing step; every input column is
        optional.

    Returns
    -------
    pandas.DataFrame
    """
    df = df.copy()

    # Guard rails: if these exist but still have NaNs, fill before ops
    for col in ['systolic_bp', 'diastolic_bp', 'exercise_hr_wk', 'phys_act_days',
                'sleep_hr', 'sedentary_hr', 'stress_lvl', 'age', 'bmi']:
        if col in df.columns and df[col].isna().any():
            median = df[col].median()
            # An all-NaN column has a NaN median; use 0 so the fill takes effect.
            df[col] = df[col].fillna(0 if pd.isna(median) else median)

    # Pulse pressure
    if {'systolic_bp', 'diastolic_bp'}.issubset(df.columns):
        df['pulse_pressure'] = df['systolic_bp'] - df['diastolic_bp']
    else:
        df['pulse_pressure'] = 0.0

    # Composite health score
    for needed in ['exercise_hr_wk', 'phys_act_days', 'sleep_hr', 'sedentary_hr', 'stress_lvl']:
        if needed not in df.columns:
            df[needed] = 0.0
    df['health_score'] = (
        df['exercise_hr_wk'] +
        df['phys_act_days'] +
        df['sleep_hr'] -
        df['sedentary_hr'] -
        df['stress_lvl']
    )

    # Risk factor count (coerce common yes/no/boolean to 0/1)
    risk_cols = ['diabetes', 'smoking', 'obesity', 'family_history',
                 'prev_heart_prob', 'alcohol']
    yes_no = {True: 1, False: 0, 'Yes': 1, 'No': 0, 'Y': 1, 'N': 0, 'yes': 1, 'no': 0}
    for c in risk_cols:
        if c in df.columns:
            # Map known tokens and keep every other value as it was; this
            # replaces a mixed-type .replace() call with an explicit
            # map-then-fallback. Unparseable or missing values count as 0.
            mapped = df[c].map(yes_no).fillna(df[c])
            df[c] = pd.to_numeric(mapped, errors='coerce').fillna(0).astype(int)
        else:
            df[c] = 0
    df['risk_factor_count'] = df[risk_cols].sum(axis=1)

    # Age groups & BMI categories (always int, never NaN -- see _bin_index)
    if 'age' in df.columns:
        df['age_group'] = _bin_index(df['age'], [0, 35, 50, 65, np.inf])
    else:
        df['age_group'] = 0

    if 'bmi' in df.columns:
        df['bmi_category'] = _bin_index(df['bmi'], [0, 18.5, 25, 30, np.inf])
    else:
        df['bmi_category'] = 0

    # Replace inf just in case
    df = df.replace([np.inf, -np.inf], np.nan)

    return df




STEP 4: Feature Engineering


In [8]:
# Derive the engineered columns on both processed frames.
train_processed, test_processed = (
    create_features(frame) for frame in (train_processed, test_processed)
)

# Summarise what was added, one bullet per new column.
print(
    "New features created:",
    "- pulse_pressure: Systolic - Diastolic BP",
    "- health_score: Composite lifestyle score",
    "- risk_factor_count: Total number of risk factors",
    "- age_group: Age categorization",
    "- bmi_category: BMI classification",
    sep="\n",
)

New features created:
- pulse_pressure: Systolic - Diastolic BP
- health_score: Composite lifestyle score
- risk_factor_count: Total number of risk factors
- age_group: Age categorization
- bmi_category: BMI classification


In [9]:
# ------------------------------------------
# STEP 5: PREPARE DATA FOR MODELING
# ------------------------------------------
print("\n" + "=" * 60, "STEP 5: Preparing Data for Modeling", "=" * 60, sep="\n")

# Target vector and feature matrix (identifier and target removed).
y = train_processed['heart_attack_risk']
X = train_processed.drop(columns=['patient_id', 'heart_attack_risk'])

# Keep the test identifiers aside for the final submission.
test_ids = test_processed['patient_id']
X_test = test_processed.drop(columns='patient_id')

# Align the test matrix with the training matrix: training-only columns are
# added as constant 0, then the columns are put in training order (which
# also discards any test-only column).
missing_cols = set(X.columns) - set(X_test.columns)
X_test = X_test.assign(**{name: 0 for name in missing_cols})
X_test = X_test[X.columns]

print(
    f"Features shape: {X.shape}",
    f"Target shape: {y.shape}",
    f"Test features shape: {X_test.shape}",
    sep="\n",
)

# Stratified 80/20 hold-out split of the training data for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(
    f"\nTraining set: {X_train.shape}",
    f"Validation set: {X_val.shape}",
    sep="\n",
)


STEP 5: Preparing Data for Modeling
Features shape: (7963, 30)
Target shape: (7963,)
Test features shape: (800, 30)

Training set: (6370, 30)
Validation set: (1593, 30)


In [10]:
print("\n" + "=" * 60)
print("STEP 5.5: Final Imputation & Sanity Checks (pre-SMOTE)")
print("=" * 60)

# Coerce every feature to numeric and turn +/-inf into NaN so the imputer
# treats it as missing. The full matrix X gets the same treatment as the
# splits: previously it was coerced but never cleaned or imputed.
# NOTE(review): the recorded "Input X contains NaN" error further down is
# consistent with X having been used un-imputed by a later cell.
X       = X.apply(pd.to_numeric, errors='coerce').replace([np.inf, -np.inf], np.nan)
X_train = X_train.apply(pd.to_numeric, errors='coerce').replace([np.inf, -np.inf], np.nan)
X_val   = X_val.apply(pd.to_numeric, errors='coerce').replace([np.inf, -np.inf], np.nan)
X_test  = X_test.apply(pd.to_numeric, errors='coerce').replace([np.inf, -np.inf], np.nan)

# Medians are learned from the training split only and then applied to every
# other matrix, so validation/test rows never influence the fill values.
num_imputer = SimpleImputer(strategy="median")
X_train = pd.DataFrame(num_imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_val   = pd.DataFrame(num_imputer.transform(X_val),   columns=X_val.columns,   index=X_val.index)
X_test  = pd.DataFrame(num_imputer.transform(X_test),  columns=X_test.columns,  index=X_test.index)
X       = pd.DataFrame(num_imputer.transform(X),       columns=X.columns,       index=X.index)

# Assert no NaNs remain in any matrix a later cell may consume
assert not np.isnan(X_train.values).any(), "NaNs remain in X_train after imputation"
assert not np.isnan(X_val.values).any(),   "NaNs remain in X_val after imputation"
assert not np.isnan(X_test.values).any(),  "NaNs remain in X_test after imputation"
assert not np.isnan(X.values).any(),       "NaNs remain in X after imputation"

print("✓ No NaNs remain in X_train/X_val/X_test.")



STEP 5.5: Final Imputation & Sanity Checks (pre-SMOTE)
✓ No NaNs remain in X_train/X_val/X_test.


In [11]:
# ------------------------------------------
# STEP 6: HANDLE CLASS IMBALANCE
# ------------------------------------------
print("\n" + "=" * 60, "STEP 6: Handling Class Imbalance with SMOTE", "=" * 60, sep="\n")

# SMOTE (Synthetic Minority Over-sampling Technique) adds synthetic
# minority-class rows to the training split only, so both classes end up
# equally represented. It runs on the features as they are at this point,
# i.e. before the scaling step.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print(
    f"Original training set: {X_train.shape}",
    f"Balanced training set: {X_train_balanced.shape}",
    f"\nClass distribution after SMOTE:",
    pd.Series(y_train_balanced).value_counts(),
    sep="\n",
)



STEP 6: Handling Class Imbalance with SMOTE
Original training set: (6370, 30)
Balanced training set: (8358, 30)

Class distribution after SMOTE:
heart_attack_risk
0    4179
1    4179
Name: count, dtype: int64


In [13]:
# ------------------------------------------
# STEP 7: FEATURE SCALING
# ------------------------------------------
print("\n" + "=" * 60, "STEP 7: Feature Scaling", "=" * 60, sep="\n")

# Standardise each feature to zero mean / unit variance. The statistics are
# learned from the balanced training matrix and reused unchanged for the
# validation and test matrices; scale-sensitive models need this.
scaler = StandardScaler().fit(X_train_balanced)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(matrix) for matrix in (X_train_balanced, X_val, X_test)
)

print(
    "Features standardized using StandardScaler",
    "Mean ≈ 0, Standard Deviation ≈ 1",
    sep="\n",
)


STEP 7: Feature Scaling
Features standardized using StandardScaler
Mean ≈ 0, Standard Deviation ≈ 1


KeyboardInterrupt: 

In [17]:
# ------------------------------------------
# STEP 8: MODEL TRAINING
# ------------------------------------------

# Candidate classifiers, keyed by display name; all are fitted on the same
# balanced, scaled training matrix and compared on the validation split.
models = {
    'Logistic Regression': LogisticRegression(
        random_state=42, max_iter=5000, class_weight='balanced',
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=200, max_depth=15,
        min_samples_split=5, min_samples_leaf=2,
        random_state=42, class_weight='balanced', n_jobs=-1,
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42,
    ),
}

# name -> {'model': fitted estimator, plus one entry per validation metric}
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    model.fit(X_train_scaled, y_train_balanced)

    # Hard labels for the threshold metrics, class-1 probability for ROC-AUC.
    y_pred = model.predict(X_val_scaled)
    y_pred_proba = model.predict_proba(X_val_scaled)[:, 1]

    # Recall is treated as the primary metric for this task.
    accuracy, precision, recall, f1 = (
        metric(y_val, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )
    roc_auc = roc_auc_score(y_val, y_pred_proba)

    results[name] = dict(
        model=model,
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1=f1,
        roc_auc=roc_auc,
    )

    print(
        f"  Accuracy:  {accuracy:.4f}",
        f"  Precision: {precision:.4f}",
        f"  Recall:    {recall:.4f}",
        f"  F1-Score:  {f1:.4f}",
        f"  ROC-AUC:   {roc_auc:.4f}",
        sep="\n",
    )


Training Logistic Regression...
  Accuracy:  0.5022
  Precision: 0.3451
  Recall:    0.4982
  F1-Score:  0.4078
  ROC-AUC:   0.5054

Training Random Forest...
  Accuracy:  0.6566
  Precision: 0.5556
  Recall:    0.0091
  F1-Score:  0.0180
  ROC-AUC:   0.4949

Training Gradient Boosting...
  Accuracy:  0.6353
  Precision: 0.4000
  Recall:    0.1204
  F1-Score:  0.1851
  ROC-AUC:   0.4970


ValueError: Input X contains NaN.
SelectKBest does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


STEP 8 (Improved): Feature Selection + Advanced Models
✅ Selected top features: ['age', 'chol', 'hr', 'diabetes', 'family_history', 'obesity', 'alcohol', 'prev_heart_prob', 'med_use', 'stress_lvl', 'phys_act_days', 'sleep_hr', 'systolic_bp', 'diastolic_bp', 'country_freq', 'continent_freq', 'pulse_pressure', 'risk_factor_count', 'age_group', 'bmi_category']

Improved Model Performance (XGBoost):
Accuracy:  0.6309
Precision: 0.3876
Recall:    0.1259
F1-Score:  0.1901
ROC-AUC:   0.5156


In [49]:
# %pip installs into the running kernel's environment; -q suppresses the
# download progress output, and the version is pinned to the one this
# notebook was last run with.
%pip install -q xgboost==3.0.5

Collecting xgboost
  Downloading xgboost-3.0.5-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.5-py3-none-win_amd64.whl (56.8 MB)
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.3/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.5/56.8 MB 1.1 MB/s eta 0:00:51
   ---------------------------------------- 0.5/56.8 MB 1.1 MB/s eta 0:00:51
    --------------------------------------- 0.8/56.8 MB 1.2 MB/s eta 0:00:49
    --------------------------------------- 1.0/56.8 MB 1.2 MB/s eta 0:00:48
    --------------------------------------- 1.3/56.8 MB 1.2 MB/s eta 0:00:48
   - -------------------------------------- 1.6/56.8 MB 1.2 MB/s eta 0:00:47
   - -------------------------------------- 1.8/56.8 MB 1.2 MB/s eta 0:00:48
   - -------------------------------------- 2.1/56.8 MB 1.2 MB/s eta 0:00:48
   - ----------------------