In [1]:
import numpy as np
import pandas as pd

import sklearn
from sklearn.preprocessing import MinMaxScaler 
from sklearn.model_selection import train_test_split 
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import LabelEncoder, StandardScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the heart-failure CSV into a DataFrame and summarise it.
dataset = pd.read_csv("../input/heart-failure-prediction/heart.csv")

# Column names, dtypes and non-null counts.
dataset.info()

# Count of rows per value of the target column.
target_counts = dataset['HeartDisease'].value_counts()
print("The target is HeartDisease and its values are", target_counts)

In [3]:
# Columns whose dtype is object ('O'); these are treated as categorical below.
categorical_features = [
    column
    for column, column_dtype in dataset.dtypes.items()
    if column_dtype == 'O'
]

print("The following are categorical features:", categorical_features)

In [4]:
# Replace every categorical value with an integer rank: categories are ordered
# by their mean HeartDisease value (ascending), so 0 is the category with the
# lowest mean target and the largest code is the one with the highest.
# NOTE(review): the means are computed over all rows of `dataset`, including
# rows that are later held out as the test split — confirm this is intended.
for feature in categorical_features:
    mean_target_by_category = dataset.groupby([feature])['HeartDisease'].mean()
    ranked_categories = mean_target_by_category.sort_values().index
    # category -> rank (0-based)
    labels_ordered = dict(zip(ranked_categories, range(len(ranked_categories))))
    dataset[feature] = dataset[feature].map(labels_ordered)

In [5]:
# Every column except the target is rescaled to the [0, 1] range.
feature_scale = [feature for feature in dataset.columns if feature != 'HeartDisease']

# Fit and transform in a single pass; the original fitted, transformed once and
# discarded the result, then transformed a second time.
# NOTE(review): the scaler is fit on all rows, before the train/test split
# performed later in the notebook.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(dataset[feature_scale])

# Rebuild `dataset` with the target first, followed by the scaled features.
# Both parts are given a fresh 0..n-1 index so they align row by row.
dataset = pd.concat(
    [
        dataset[['HeartDisease']].reset_index(drop=True),
        pd.DataFrame(scaled_values, columns=feature_scale),
    ],
    axis=1,
)

In [6]:
# Name of the target column; every other column is used as a feature.
predict = 'HeartDisease'

# Use the `columns=` keyword: passing the axis positionally
# (`drop([predict], 1)`) was deprecated in pandas 1.3 and raises TypeError
# from pandas 2.0 onwards.
X = np.array(dataset.drop(columns=[predict]))
y = np.array(dataset[predict])

# Hold out 20% of the rows for testing; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)




def _scaled_pca_pipeline(suffix, estimator):
    """Build a StandardScaler -> PCA(2 components) -> estimator pipeline.

    ``suffix`` is appended to the scaler and PCA step names ('scalar<suffix>',
    'pca<suffix>'). The final step is named 'lr_classifier' for every
    estimator, which keeps the step names used by the existing pipelines.
    """
    return Pipeline([
        ('scalar{}'.format(suffix), StandardScaler()),
        ('pca{}'.format(suffix), PCA(n_components=2)),
        ('lr_classifier', estimator),
    ])


# One pipeline per candidate classifier; all share the same preprocessing.
pipeline_LR = _scaled_pca_pipeline(1, LogisticRegression(random_state=42))
pipeline_DTC = _scaled_pca_pipeline(2, DecisionTreeClassifier(random_state=42))
pipeline_RFC = _scaled_pca_pipeline(3, RandomForestClassifier(random_state=42))
pipeline_KNC = _scaled_pca_pipeline(4, KNeighborsClassifier(n_neighbors=3))
pipeline_GBC = _scaled_pca_pipeline(5, GradientBoostingClassifier(random_state=42))
pipeline_SVC = _scaled_pca_pipeline(6, SVC(random_state=42))

# Order matters: positions in this list are used as keys for display names.
pipelines = [
    pipeline_LR,
    pipeline_DTC,
    pipeline_RFC,
    pipeline_KNC,
    pipeline_GBC,
    pipeline_SVC,
]



# Display name for each entry of `pipelines`, keyed by its position in the list.
pipe_dict = {0: 'Logistic Regression',
             1: 'Decision Tree',
             2: 'RandomForest',
             3: 'KNeighborsClassifier',
             4: 'GradientBoostingClassifier',
             5: 'SVC'}

# Fit every candidate pipeline on the training split.
for pipe in pipelines:
    pipe.fit(X_train, y_train)

# Score each fitted pipeline exactly once on the test split and reuse the
# value for both reporting and selection (the original re-ran `score`, and
# therefore prediction, up to three times per model).
test_scores = [model.score(X_test, y_test) for model in pipelines]

for i, score in enumerate(test_scores):
    print("{} Test Accuracy: {}".format(pipe_dict[i], score))

# Track the best-scoring pipeline. The comparison is strict, so on a tie the
# earliest pipeline in the list wins; if no score exceeds 0.0 the initial
# placeholders below are left unchanged.
# NOTE(review): the winner is chosen on the same test split the accuracies are
# reported on, so the best accuracy is an optimistic estimate.
best_accuracy = 0.0
best_classifier = 0
best_pipeline = ""

for i, (model, score) in enumerate(zip(pipelines, test_scores)):
    if score > best_accuracy:
        best_accuracy = score
        best_pipeline = model
        best_classifier = i
print('Classifier with best accuracy:{}'.format(pipe_dict[best_classifier]))