In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import permutation_importance

In [2]:
# Load the data
data = pd.read_csv("heart_disease_health_indicators_BRFSS2015.csv")

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   HeartDiseaseorAttack  253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   Diabetes              253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [4]:
data.shape

(253680, 22)

In [5]:
data.head().T

Unnamed: 0,0,1,2,3,4
HeartDiseaseorAttack,0.0,0.0,0.0,0.0,0.0
HighBP,1.0,0.0,1.0,1.0,1.0
HighChol,1.0,0.0,1.0,0.0,1.0
CholCheck,1.0,0.0,1.0,1.0,1.0
BMI,40.0,25.0,28.0,27.0,24.0
Smoker,1.0,1.0,0.0,0.0,0.0
Stroke,0.0,0.0,0.0,0.0,0.0
Diabetes,0.0,0.0,0.0,0.0,0.0
PhysActivity,0.0,1.0,0.0,1.0,1.0
Fruits,0.0,0.0,1.0,1.0,1.0


In [6]:
# Split the data into features and labels
X = data.drop("HeartDiseaseorAttack", axis=1)
y = data["HeartDiseaseorAttack"]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Train different models
models = {"Random Forest": RandomForestClassifier(),
          "Logistic Regression": LogisticRegression(),
          "Support Vector Machine": SVC(),
          "K-Nearest Neighbors": KNeighborsClassifier()}

In [9]:
from sklearn.preprocessing import StandardScaler

# Create an instance of StandardScaler
scaler = StandardScaler()

# Fit the scaler to the training data
scaler.fit(X_train)

# Transform the training and test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create an instance of the logistic regression class
clf = LogisticRegression()

# Fit the model to the scaled training data
clf.fit(X_train_scaled, y_train)

In [10]:
# Store the accuracy of each model
accuracies = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies[name] = accuracy
    print(f"{name} accuracy: {accuracy:.2f}")


Random Forest accuracy: 0.90


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression accuracy: 0.91


In [None]:
# Get the model with the highest accuracy
best_model_name = max(accuracies, key=accuracies.get)
best_model = models[best_model_name]

In [None]:
# Get the feature importances
perm = permutation_importance(best_model, X_test, y_test, n_repeats=10, random_state=42)
feature_importances = perm.importances_mean

In [None]:
# Get the feature names
feature_names = X.columns


In [None]:
# Create a dictionary of feature importances
feature_importance_dict = dict(zip(feature_names, feature_importances))

In [None]:
# Sort the dictionary by importance
sorted_feature_importance = dict(sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True))

# Print the sorted feature importances
print("Feature importances:")
for feature, importance in sorted_feature_importance.items():
    print(f"{feature}: {importance:.2f}")

Feature importances:
GenHlth: 0.00
Age: 0.00
Sex: 0.00
Stroke: 0.00
CholCheck: 0.00
HighChol: 0.00
DiffWalk: 0.00
Diabetes: 0.00
NoDocbcCost: 0.00
Smoker: 0.00
PhysActivity: 0.00
BMI: 0.00
Education: 0.00
AnyHealthcare: 0.00
HvyAlcoholConsump: -0.00
PhysHlth: -0.00
MentHlth: -0.00
HighBP: -0.00
Veggies: -0.00
Fruits: -0.00
Income: -0.00


In [None]:
n = 21
top_n_features = list(sorted_feature_importance.keys())[:n]

In [None]:
patient_data_top_n = {}
for feature in top_n_features:
   patient_data_top_n[feature] = input(f"Enter the {feature} of the patient: ")

Enter the GenHlth of the patient: 5
Enter the Age of the patient: 9
Enter the Sex of the patient: 0
Enter the Stroke of the patient: 0
Enter the CholCheck of the patient: 1
Enter the HighChol of the patient: 1
Enter the DiffWalk of the patient: 1
Enter the Diabetes of the patient: 0
Enter the NoDocbcCost of the patient: 0
Enter the Smoker of the patient: 1
Enter the PhysActivity of the patient: 0
Enter the BMI of the patient: 40
Enter the Education of the patient: 4
Enter the AnyHealthcare of the patient: 1
Enter the HvyAlcoholConsump of the patient: 0
Enter the PhysHlth of the patient: 15
Enter the MentHlth of the patient: 18
Enter the HighBP of the patient: 1
Enter the Veggies of the patient: 1
Enter the Fruits of the patient: 0
Enter the Income of the patient: 3


In [None]:
patient_data_top_n = pd.DataFrame(data=[patient_data_top_n], columns=patient_data_top_n.keys())

In [None]:
patient_prediction_top_n = best_model.predict(patient_data_top_n)
patient_prediction_prob_top_n = best_model.predict_proba(patient_data_top_n)

if patient_prediction_top_n[0] == 0:
    print("The patient is unlikely to have heart disease with probability of {} using top {} features.".format(patient_prediction_prob_top_n[0][0], n))
else:
    print("The patient is likely to have heart disease with probability of {} using top {} features.".format(patient_prediction_prob_top_n[0][1], n))


The patient is unlikely to have heart disease with probability of 0.9999999446050293 using top 21 features.


Feature names must be in the same order as they were in fit.

Feature names must be in the same order as they were in fit.

