In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import classification_report, f1_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import xgboost


df = pd.read_csv("./datasets/heart.csv")
df.columns = ['age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol', 'fasting_blood_sugar', 'rest_ecg_type', 'max_heart_rate_achieved',
              'exercise_induced_angina', 'st_depression', 'st_slope_type', 'num_major_vessels', 'thalassemia_type', 'target']
# cp - chest_pain_type
df.loc[df['chest_pain_type'] == 0, 'chest_pain_type'] = 'asymptomatic'
df.loc[df['chest_pain_type'] == 1, 'chest_pain_type'] = 'atypical angina'
df.loc[df['chest_pain_type'] == 2, 'chest_pain_type'] = 'non-anginal pain'
df.loc[df['chest_pain_type'] == 3, 'chest_pain_type'] = 'typical angina'
# restecg - rest_ecg_type
df.loc[df['rest_ecg_type'] == 0, 'rest_ecg_type'] = 'left ventricular hypertrophy'
df.loc[df['rest_ecg_type'] == 1, 'rest_ecg_type'] = 'normal'
df.loc[df['rest_ecg_type'] == 2, 'rest_ecg_type'] = 'ST-T wave abnormality'
# slope - st_slope_type
df.loc[df['st_slope_type'] == 0, 'st_slope_type'] = 'downsloping'
df.loc[df['st_slope_type'] == 1, 'st_slope_type'] = 'flat'
df.loc[df['st_slope_type'] == 2, 'st_slope_type'] = 'upsloping'
# thal - thalassemia_type
df.loc[df['thalassemia_type'] == 0, 'thalassemia_type'] = 'nothing'
df.loc[df['thalassemia_type'] == 1, 'thalassemia_type'] = 'fixed defect'
df.loc[df['thalassemia_type'] == 2, 'thalassemia_type'] = 'normal'
df.loc[df['thalassemia_type'] == 3, 'thalassemia_type'] = 'reversable defect'
data = pd.get_dummies(df, drop_first=False)
df_temp = data['thalassemia_type_fixed defect']
data = pd.get_dummies(df, drop_first=True)
display(data.head())
frames = [data, df_temp]
result = pd.concat(frames, axis=1)
result.drop('thalassemia_type_nothing', axis=1, inplace=True)
X = result.drop('target', axis=1)
y = result['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

logre = LogisticRegression()
logre.fit(X_train, y_train)
y_pred = logre.predict(X_test)
display(accuracy_score(y_test, y_pred))
dt = DecisionTreeClassifier(random_state=42)

# fitting the model
dt.fit(X_train, y_train)

# calculating the predictions
y_pred = dt.predict(X_test)

# displaying the test accuracy
display("The test accuracy score of Decision Tree is ",
      accuracy_score(y_test, y_pred), f1_score(y_test, y_pred, average='macro'))

# instantiating the object
rf = RandomForestClassifier()

# fitting the model
rf.fit(X_train, y_train)

# calculating the predictions
y_pred = dt.predict(X_test)

# displaying the test accuracy
display("The test accuracy score of Random Forest is ",
      accuracy_score(y_test, y_pred), f1_score(y_test, y_pred, average='macro'))

# instantiate the classifier
xgb_classifier = xgboost.XGBClassifier(n_estimators=50, random_state=42)

# fitting the model
xgb_classifier.fit(X_train, y_train)

# predicting values
y_pred = xgb_classifier.predict(X_test)
display("The test accuracy score of Gradient Boosting Classifier is ",
      accuracy_score(y_test, y_pred), f1_score(y_test, y_pred, average='macro'))
# instantiating the object and fitting
clf = SVC(kernel='linear', C=1, random_state=42).fit(X_train, y_train)

# predicting the values
y_pred = clf.predict(X_test)

# displaying the test accuracy
display("The test accuracy score of SVM is ", accuracy_score(
    y_test, y_pred), f1_score(y_test, y_pred, average='macro'))
perm_importance = permutation_importance(
    clf, X_test, y_test, n_repeats=30, random_state=42)

# Get feature importances
rf_importances = rf.feature_importances_
xgb_importances = xgb_classifier.feature_importances_
svm_importances = perm_importance
logireg_importances = abs(logre.coef_[0])

# Create a DataFrame with feature importances
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'RandomForest': rf_importances,
    'XGBoost': xgb_importances,
    'logistice regression': logireg_importances,
    'svm': svm_importances.importances_mean,

})

display(feature_importances)


  df.loc[df['chest_pain_type'] == 0, 'chest_pain_type'] = 'asymptomatic'
  df.loc[df['rest_ecg_type'] == 0, 'rest_ecg_type'] = 'left ventricular hypertrophy'
  df.loc[df['st_slope_type'] == 0, 'st_slope_type'] = 'downsloping'
  df.loc[df['thalassemia_type'] == 0, 'thalassemia_type'] = 'nothing'


Unnamed: 0,age,sex,resting_blood_pressure,cholesterol,fasting_blood_sugar,max_heart_rate_achieved,exercise_induced_angina,st_depression,num_major_vessels,target,chest_pain_type_atypical angina,chest_pain_type_non-anginal pain,chest_pain_type_typical angina,rest_ecg_type_left ventricular hypertrophy,rest_ecg_type_normal,st_slope_type_flat,st_slope_type_upsloping,thalassemia_type_normal,thalassemia_type_nothing,thalassemia_type_reversable defect
0,63,1,145,233,1,150,0,2.3,0,1,False,False,True,True,False,False,False,False,False,False
1,37,1,130,250,0,187,0,3.5,0,1,False,True,False,False,True,False,False,True,False,False
2,41,0,130,204,0,172,0,1.4,0,1,True,False,False,True,False,False,True,True,False,False
3,56,1,120,236,0,178,0,0.8,0,1,True,False,False,False,True,False,True,True,False,False
4,57,0,120,354,0,163,1,0.6,0,1,False,False,False,False,True,False,True,True,False,False


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8524590163934426

'The test accuracy score of Decision Tree is '

0.7540983606557377

0.7516960651289009

'The test accuracy score of Random Forest is '

0.7540983606557377

0.7516960651289009

'The test accuracy score of Gradient Boosting Classifier is '

0.819672131147541

0.8194780737153619

'The test accuracy score of SVM is '

0.8524590163934426

0.8510176390773405

Unnamed: 0,Feature,RandomForest,XGBoost,logistice regression,svm
0,age,0.087602,0.027908,0.005461,0.004372
1,sex,0.034333,0.050157,1.165797,0.003279
2,resting_blood_pressure,0.072512,0.024536,0.007629,0.010383
3,cholesterol,0.082265,0.023193,0.004386,0.0
4,fasting_blood_sugar,0.008195,0.00776,0.017011,0.001639
5,max_heart_rate_achieved,0.113511,0.027061,0.023979,0.03388
6,exercise_induced_angina,0.061938,0.081167,0.959432,0.003825
7,st_depression,0.088788,0.035846,0.32748,0.020219
8,num_major_vessels,0.131887,0.100246,0.916803,0.037705
9,chest_pain_type_atypical angina,0.011377,0.015714,0.278689,0.0
