# ADASYN (Adaptive Synthetic Sampling)

In [3]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, StratifiedKFold
from imblearn.over_sampling import ADASYN  # For handling imbalanced data

# Load dataset
file_path = "Indian_heart_disease.csv"  # Ensure correct path if necessary
df = pd.read_csv(file_path)

# Handling missing values
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Fill numerical columns with median
imputer_num = SimpleImputer(strategy='median')
df[num_cols] = imputer_num.fit_transform(df[num_cols])
# Fill categorical columns with mode
imputer_cat = SimpleImputer(strategy='most_frequent')
df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])

# Encoding categorical variables
encoder = LabelEncoder()
for col in cat_cols:
    df[col] = encoder.fit_transform(df[col])

# Splitting data into features and target
X = df.drop(columns=["Heart Disease Status"])
y = df["Heart Disease Status"]

# Normalize numerical features
scaler = MinMaxScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

#scaler = StandardScaler()
#X[num_cols] = scaler.fit_transform(X[num_cols])

# Apply ADASYN to handle imbalanced data
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X, y)

# Check the class distribution after ADASYN
print("Class distribution after ADASYN:")
print(pd.Series(y_resampled).value_counts())

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.25, random_state=42)

# Initialize models
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Naïve Bayes": GaussianNB(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "Logistic Regression": LogisticRegression(),
    "Gradient Boosting": GradientBoostingClassifier()
}

# Train and evaluate models
accuracy_results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_results[name] = accuracy

# Display results before tuning
print("\nModel Accuracy Before Hyperparameter Tuning:")
for model, acc in accuracy_results.items():
    print(f"{model}: {acc:.4f}")

# Evaluate with additional metrics (e.g., classification report, confusion matrix, AUC-ROC)
print("\nDetailed Evaluation for Random Forest:")
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
feature_importance = pd.Series(rf_model.feature_importances_, index=X.columns)
feature_importance.sort_values(ascending=False, inplace=True)

# Display most important features
print("Top Features for Predicting Heart Disease:")
print(feature_importance)  # Show top 10 features

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# AUC-ROC Score
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_pred_proba)
print(f"\nAUC-ROC Score: {auc_roc:.4f}")

Class distribution after ADASYN:
Heart Disease Status
1    8078
0    8000
Name: count, dtype: int64


Parameters: { "use_label_encoder" } are not used.




Model Accuracy Before Hyperparameter Tuning:
Decision Tree: 0.6734
Naïve Bayes: 0.6465
K-Nearest Neighbors: 0.7102
SVM: 0.6891
Random Forest: 0.8050
AdaBoost: 0.6639
XGBoost: 0.8017
Logistic Regression: 0.5552
Gradient Boosting: 0.7060

Detailed Evaluation for Random Forest:
Top Features for Predicting Heart Disease:
Age                     0.096949
Blood Pressure          0.094142
Fasting Blood Sugar     0.092233
Homocysteine Level      0.090698
Cholesterol Level       0.090462
CRP Level               0.089958
BMI                     0.089862
Sleep Hours             0.087665
Triglyceride Level      0.085288
Stress Level            0.023149
Exercise Habits         0.022409
Sugar Consumption       0.022346
Alcohol Consumption     0.021859
Family Heart Disease    0.013598
High Blood Pressure     0.013548
Smoking                 0.013402
Gender                  0.013383
High LDL Cholesterol    0.013356
Diabetes                0.012862
Low HDL Cholesterol     0.012832
dtype: float64

Conf

## Apply K-Fold Cross-Validation (k=10)

In [6]:

# Perform 10-Fold Cross-Validation
cv_results = {}
for name, model in models.items():
    scores = cross_val_score(model, X_resampled, y_resampled, cv=10, scoring='accuracy')
    cv_results[name] = scores.max()

# Display results
print("Model Accuracy with 10-Fold Cross-Validation:")
for model, acc in cv_results.items():
    print(f"{model}: {acc:.4f}")


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Model Accuracy with 10-Fold Cross-Validation:
Decision Tree: 0.7121
Naïve Bayes: 0.7214
K-Nearest Neighbors: 0.7394
SVM: 0.7400
Random Forest: 0.8775
AdaBoost: 0.7245
XGBoost: 0.8812
Logistic Regression: 0.6063
Gradient Boosting: 0.7879


## Hyperparameter Tuning using GridSearchCV

In [8]:

# Hyperparameter tuning for selected models
param_grids = {
    "Random Forest": {
        'n_estimators': [10,50,100,200],
        'max_depth': [10,20,50, None]
    },
    "SVM": {
        'C': [0.1, 1, 10,],
        'kernel': ['linear', 'rbf']
    },
    "K-Nearest Neighbors": {
        'n_neighbors': [3,5,7],
        'weights': ['uniform', 'distance']
    },
    "XGBoost": {
         'n_estimators': [50, 100, 200],
         'learning_rate': [0.01, 0.1, 0.2],
         'max_depth': [3, 6, 9]
    }
}

best_models = {}
for model_name, param_grid in param_grids.items():
    model = models[model_name]
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_

# Evaluate tuned models
tuned_results = {}
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    tuned_results[name] = accuracy

# Display results after tuning
print("Model Accuracy After Hyperparameter Tuning:")
for model, acc in tuned_results.items():
    print(f"{model}: {acc:.4f}")


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Model Accuracy After Hyperparameter Tuning:
Random Forest: 0.8182
SVM: 0.7649
K-Nearest Neighbors: 0.7396
XGBoost: 0.8276


In [9]:

# Hyperparameter tuning for selected models
param_grids = {
    "Random Forest": {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    "SVM": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly']
    },
    "K-Nearest Neighbors": {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance']
    },
    "XGBoost": {
        'n_estimators': [50, 100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 6, 9],
        'colsample_bytree': [0.5, 0.7, 1.0]
    }
}

best_models = {}
for model_name, param_grid in param_grids.items():
    model = models[model_name]
    grid_search = RandomizedSearchCV(model, param_grid, cv=5, scoring='accuracy', n_iter=10, n_jobs=-1, random_state=42)
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_

# Evaluate tuned models
tuned_results = {}
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    tuned_results[name] = accuracy

# Display results after tuning
print("Model Accuracy After Hyperparameter Tuning:")
for model, acc in tuned_results.items():
    print(f"{model}: {acc:.4f}")


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Model Accuracy After Hyperparameter Tuning:
Random Forest: 0.7933
SVM: 0.7649
K-Nearest Neighbors: 0.7396
XGBoost: 0.8244


In [10]:

import pickle

# Assuming 'model' is your trained machine learning model
with open("model.pkl", "wb") as f:
    pickle.dump(model, f)

print("Model saved as model.pkl")
    

Model saved as model.pkl
