In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas_profiling as pp
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
#ensembling
from mlxtend.classifier import StackingCVClassifier

In [None]:
# Framingham CSV; the path is relative to the notebook's working directory.
csv_path = '../input/heart-disease-prediction-using-logistic-regression/framingham.csv'
data = pd.read_csv(csv_path)
# Show the first rows as the cell output.
data.head()

In [None]:
# Missing values are imputed column by column with that column's mean
# (the alternative of dropping incomplete rows with data.dropna() is not used).
def _fill_with_column_mean(column):
    """Return `column` with its NaNs replaced by the column mean."""
    return column.fillna(column.mean())

data = data.apply(_fill_with_column_mean)
# Separate the label column from the predictor columns.
target = data["TenYearCHD"]
features = data.drop(columns='TenYearCHD')
# Class balance of the label, followed by a dtype / non-null summary of the frame.
sns.histplot(target)
data.info()

In [None]:
# Build the pandas-profiling report for `data` and render it as the cell output.
profile_report = pp.ProfileReport(data)
profile_report

In [None]:
# Oversample the minority class with SMOTE and plot the resulting label balance.
# A fixed random_state makes the synthetic samples, and every score computed
# from them further down, reproducible across kernel restarts.
# NOTE(review): resampling happens before the train/test split, so synthetic
# points interpolated from rows that later land in the test split can end up in
# the training split. Treat scores on the oversampled data as optimistic.
smote = SMOTE(random_state=42)
x_smote, y_smote = smote.fit_resample(features, target)
sns.histplot(y_smote)

In [None]:
# Both splits share the same settings: 80/20, shuffled, fixed seed.
split_options = dict(test_size=0.2, random_state=42, shuffle=True)

# Hold-out split of the original data.
x_train, x_test, y_train, y_test = train_test_split(features, target, **split_options)
# Hold-out split of the SMOTE-oversampled data.
xs_train, xs_test, ys_train, ys_test = train_test_split(x_smote, y_smote, **split_options)

# SVM 
## Original Data

In [None]:
# Grid search over SVC kernels and integer C values (1..10) on the original data.
# The model with the highest test accuracy is remembered together with its
# predictions; every [C, accuracy, kernel] row is logged for the plot further down.
kernels = ['rbf', 'poly', 'sigmoid']
best_svm, best_pred = None, None
best_score = 0
all_results = []
for kernel in kernels:
    print("Using kernel =", kernel)
    for c in range(1, 11):
        print("\tusing C={}".format(c), end = " ")
        # probability=True so the fitted model exposes predict_proba.
        svc = SVC(kernel=kernel, C=c, probability=True).fit(x_train, y_train)
        y_pred = svc.predict(x_test)
        score = accuracy_score(y_test, y_pred)
        all_results.append([c, score, kernel])
        print("score = ", score)
        if score > best_score:
            # Strict improvement only: on ties the earliest configuration is kept.
            best_score, best_svm, best_pred = score, svc, y_pred
# Object dtype keeps the mixed int / float / str rows intact.
all_results = np.array(all_results, dtype=object)

In [None]:
# Summary of the best SVM found on the original data.
print("Best score using kernel={} with c = {}".format(best_svm.kernel, best_svm.C))
print(classification_report(y_test, best_pred))
print("Accuracy score", accuracy_score(y_test, best_pred))
# Confusion matrix divided by the total count, so each cell is a share of all test rows.
cm = confusion_matrix(y_test, best_pred)
cm_share = cm / np.sum(cm)
sns.heatmap(cm_share, annot=True, fmt='.2%', cmap='Blues')

## Using data after oversampling

In [None]:
kernels = ['rbf', 'poly', 'sigmoid']
best_svm_smote = None
best_score_smote = 0
best_pred_smote = None
all_results_smote = []
for kernel in kernels:
    print("Using kernel =", kernel)
    for c in range(1, 11):
        print("\tusing C={}".format(c), end = " ")
        svc =  SVC(kernel=kernel, C=c, probability = True)
        svc.fit(xs_train, ys_train)
        ys_pred = svc.predict(xs_test)
        score = accuracy_score(ys_test, ys_pred)
        all_results_smote.append([c, score, kernel])
        print("score = ", score)
        if score > best_score_smote:
            best_score_smote = score
            best_svm_smote = svc
            best_pred_smote = y_pred
all_results_smote = np.array(all_results_smote, dtype=object)

In [None]:
print("Best score using kernel={} with c = {}".format(best_svm_smote.kernel, best_svm_smote.C))
cm = confusion_matrix(y_test,best_pred_smote)
print(classification_report(y_test,best_pred_smote))
print("Accuracy score", accuracy_score(y_test, best_pred_smote))
sns.heatmap(cm/np.sum(cm), annot=True, fmt = '.2%', cmap = 'Blues')

In [None]:
# Accuracy vs. C for every kernel: original-data runs first, then the SMOTE runs.
# Each results array holds 10 rows per kernel in the order rbf, poly, sigmoid;
# column 0 is C and column 1 is the accuracy.
svm_curves = [
    (all_results, slice(None, 10), "kernel(RBF)"),
    (all_results, slice(10, 20), "kernel(Poly)"),
    (all_results, slice(20, None), "kernel(sigmoid)"),
    (all_results_smote, slice(None, 10), "kernel(RBF)-SMOTE"),
    (all_results_smote, slice(10, 20), "kernel(Poly)-SMOTE"),
    (all_results_smote, slice(20, None), "kernel(sigmoid)-SMOTE"),
]
for results, rows, curve_label in svm_curves:
    chunk = results[rows]
    plt.plot(chunk[:, 0], chunk[:, 1], label=curve_label)
plt.title("Scores on SVM by C value")
plt.xlabel("C")
plt.ylabel("score")
plt.xticks(range(1, 11))
plt.legend()
plt.show()

# Logistic Regression

## Original Data

In [None]:
# Leave-one-feature-out study for logistic regression on the original data:
# one fit per dropped column, plus a final fit on all columns
# (param == number of columns). The best-scoring model is remembered, and
# `worst_param` records the index of the column whose removal gave that score.
all_results = []
best_lr = None
best_score = 0
best_pred = None
worst_param = None
n_features = x_train.shape[1]
for param in range(n_features + 1):
    if param != n_features:
        print("Without using param", features.columns[param], end = " ")
        train = x_train.drop(x_train.columns[param], axis=1)
        test = x_test.drop(x_test.columns[param], axis=1)
    else:
        print("Using all parameters", end = " ")
        # Use the full feature set here. Without this reset, `train`/`test`
        # would still hold the frames from the previous iteration (last column
        # dropped), so the "all parameters" run would not use all parameters.
        train = x_train
        test = x_test
    lr_model = LogisticRegression(random_state = 42, max_iter = 10000)
    lr_model.fit(train , y_train)
    y_pred = lr_model.predict(test)
    score = accuracy_score(y_test, y_pred)
    all_results.append([param, score])
    print("score", score)
    if score > best_score:
        best_lr = lr_model
        best_score = score
        best_pred = y_pred
        worst_param = param
# worst_param == n_features means the full model won, so there is no column to name
# (indexing features.columns with it would raise IndexError).
if worst_param == n_features:
    print("Best using all parameters")
else:
    print("Best without using", features.columns[worst_param])
cm = confusion_matrix(y_test,best_pred)
print(classification_report(y_test,best_pred))
# Confusion matrix divided by the total count, so each cell is a share of all test rows.
sns.heatmap(cm/np.sum(cm), annot=True, fmt = '.2%', cmap = 'Blues')
all_results = np.array(all_results, dtype=object)

## Using data after oversampling

In [None]:
# Leave-one-feature-out study for logistic regression on the oversampled data:
# one fit per dropped column, plus a final fit on all columns
# (param == number of columns). The best-scoring model is remembered.
# The winning column index is kept in `worst_param_smote` so that it does not
# overwrite `worst_param`, which describes the original-data model `best_lr`
# and is read again by the ensemble cell.
all_results_smote = []
best_lr_smote = None
best_score_smote = 0
best_pred_smote = None
worst_param_smote = None
n_features_smote = xs_train.shape[1]
for param in range(n_features_smote + 1):
    if param != n_features_smote:
        print("Without using param",xs_train.columns[param], end = " ")
        train = xs_train.drop(xs_train.columns[param], axis=1)
        test = xs_test.drop(xs_test.columns[param], axis=1)
    else:
        print("Using all parameters", end = " ")
        # Use the full feature set here. Without this reset, `train`/`test`
        # would still hold the frames from the previous iteration (last column
        # dropped), so the "all parameters" run would not use all parameters.
        train = xs_train
        test = xs_test
    lr_model = LogisticRegression(random_state = 42, max_iter = 10000)
    lr_model.fit(train , ys_train)
    ys_pred = lr_model.predict(test)
    score = accuracy_score(ys_test, ys_pred)
    all_results_smote.append([param, score])
    print("score", score)
    if score > best_score_smote:
        best_lr_smote = lr_model
        best_score_smote = score
        best_pred_smote = ys_pred
        worst_param_smote = param
# worst_param_smote == n_features_smote means the full model won, so there is no
# column to name (indexing the columns with it would raise IndexError).
if worst_param_smote == n_features_smote:
    print("Best using all parameters")
else:
    print("Best without using", xs_train.columns[worst_param_smote])
cm = confusion_matrix(ys_test,best_pred_smote)
print(classification_report(ys_test,best_pred_smote))
# Confusion matrix divided by the total count, so each cell is a share of all test rows.
sns.heatmap(cm/np.sum(cm), annot=True, fmt = '.2%', cmap = 'Blues')
all_results_smote = np.array(all_results_smote, dtype=object)

In [None]:
# Accuracy of logistic regression per removed column, original vs. oversampled data.
# Tick labels: one per removed column, plus "None" for the run that removes nothing.
labels = np.insert(x_train.columns, len(x_train.columns), "None")
# Column 0 is the index of the removed column, column 1 the accuracy.
plt.plot(all_results[:, 0], all_results[:, 1], label = "Original data")
plt.plot(all_results_smote[:, 0], all_results_smote[:, 1], label = "after SMOTE")
# The title previously read "Scores on SVM by C value", copied from the SVM plot.
plt.title("Scores on logistic regression by removed parameter")
plt.ylabel("score")
plt.xticks(range(len(labels)), labels = labels, rotation=90)
plt.xlabel("parameter removed")
plt.legend()
plt.show()

# Random forest: different criteria and numbers of estimators
## Using the original dataset

In [None]:
# Random-forest sweep on the original data: both split criteria, 10..200 trees
# in steps of 10. Each run is logged as [model, predictions, accuracy, n_trees],
# and the run with the highest test accuracy is remembered.
criterions = ['gini', 'entropy']
best_forest = None
best_pred = []
best_score = -1
all_results = []
for criterion in criterions:
    print("Using", criterion)
    for estimators in range(10, 201, 10):
        print("\t{} estimators".format(estimators), end = " ")
        forest = RandomForestClassifier(
            n_estimators=estimators,
            criterion=criterion,
            random_state=42,
        ).fit(x_train, y_train)
        y_pred = forest.predict(x_test)
        score = accuracy_score(y_test, y_pred)
        all_results.append([forest, y_pred, score, estimators])
        print("score = {}".format(score))
        if score > best_score:
            # Strict improvement only: on ties the earliest configuration is kept.
            best_pred, best_score, best_forest = y_pred, score, forest
all_results = np.array(all_results, dtype=object)

## Using data after oversampling

In [None]:
# Random-forest sweep on the oversampled data: both split criteria, 10..200 trees
# in steps of 10. Each run is logged as [model, predictions, accuracy, n_trees],
# and the run with the highest test accuracy is remembered.
criterions = ['gini', 'entropy']
best_forest_smote = None
best_pred_smote = []
best_score_smote = -1
all_results_smote = []
for criterion in criterions:
    print("Using", criterion)
    for estimators in range(10, 201, 10):
        print("\t{} estimators".format(estimators), end = " ")
        forest = RandomForestClassifier(
            n_estimators=estimators,
            criterion=criterion,
            random_state=42,
        ).fit(xs_train, ys_train)
        ys_pred = forest.predict(xs_test)
        score = accuracy_score(ys_test, ys_pred)
        all_results_smote.append([forest, ys_pred, score, estimators])
        print("score = {}".format(score))
        if score > best_score_smote:
            # Strict improvement only: on ties the earliest configuration is kept.
            best_pred_smote, best_score_smote, best_forest_smote = ys_pred, score, forest
all_results_smote = np.array(all_results_smote, dtype=object)

In [None]:
# Accuracy vs. number of trees. Each results array holds 20 rows per criterion
# (gini first, then entropy); column 3 is the tree count, column 2 the accuracy.
forest_curves = [
    (all_results[:20], "Gini"),
    (all_results[20:], 'Entropy'),
    (all_results_smote[:20], "Gini - smote"),
    (all_results_smote[20:], 'Entropy - smote'),
]
for runs, curve_label in forest_curves:
    plt.plot(runs[:, 3], runs[:, 2], label=curve_label)
plt.title("Scores on random forest by number of estimators(using smote)")
plt.xlabel("number of estimators")
plt.ylabel("score")
plt.legend()
plt.show()

In [None]:
# Summary of the best random forest found on the original data.
print("\t\tOriginal data")
n_trees = len(best_forest.estimators_)
print("Best score using criterion={} with {} estimators".format(best_forest.criterion, n_trees))
print(classification_report(y_test, best_pred))
print("Accuracy score", accuracy_score(y_test, best_pred))
# Confusion matrix divided by the total count, so each cell is a share of all test rows.
cm = confusion_matrix(y_test, best_pred)
cm_share = cm / np.sum(cm)
sns.heatmap(cm_share, annot=True, fmt='.2%', cmap='Blues')

In [None]:
# Summary of the best random forest found on the oversampled data.
print("\t\tafter smote")

n_trees_smote = len(best_forest_smote.estimators_)
print("Best score using criterion={} with {} estimators".format(best_forest_smote.criterion, n_trees_smote))
print(classification_report(ys_test, best_pred_smote))
print("Accuracy score", accuracy_score(ys_test, best_pred_smote))
# Confusion matrix divided by the total count, so each cell is a share of all test rows.
cm = confusion_matrix(ys_test, best_pred_smote)
cm_share = cm / np.sum(cm)
sns.heatmap(cm_share, annot=True, fmt='.2%', cmap='Blues')

# KNN - different K's
## Original dataset

In [None]:
# KNN sweep on the original data: Minkowski power p in {1, 2} and k = 2..30.
# Every [k, accuracy] pair is logged (29 rows per p), and the configuration
# with the highest test accuracy is remembered.
best_knn = None
best_pred = []
best_score = -1
all_results = []
for p in [1, 2]:
    print("Using l{}".format(p))
    for k in range(2, 31):
        print("\tUsing k={}".format(k), end = ' ')
        knn = KNeighborsClassifier(n_neighbors=k, p=p).fit(x_train, y_train)
        y_pred = knn.predict(x_test)
        score = accuracy_score(y_test, y_pred)
        all_results.append([k, score])
        print("score = {}".format(score))
        if score > best_score:
            # Strict improvement only: on ties the earliest configuration is kept.
            best_pred, best_score, best_knn = y_pred, score, knn
all_results = np.array(all_results, dtype=object)

In [None]:
# KNN sweep on the oversampled data: Minkowski power p in {1, 2} and k = 2..30.
# Every [k, accuracy] pair is logged (29 rows per p), and the configuration
# with the highest test accuracy is remembered.
best_knn_smote = None
best_pred_smote = []
best_score_smote = -1
all_results_smote = []
for p in [1, 2]:
    print("Using l{}".format(p))
    for k in range(2, 31):
        print("\tUsing k={}".format(k), end = ' ')
        knn = KNeighborsClassifier(n_neighbors=k, p=p).fit(xs_train, ys_train)
        ys_pred = knn.predict(xs_test)
        score = accuracy_score(ys_test, ys_pred)
        all_results_smote.append([k, score])
        print("score = {}".format(score))
        if score > best_score_smote:
            # Strict improvement only: on ties the earliest configuration is kept.
            best_pred_smote, best_score_smote, best_knn_smote = ys_pred, score, knn
all_results_smote = np.array(all_results_smote, dtype=object)

In [None]:
# Accuracy vs. k. Each results array holds 29 rows per distance (l1 first,
# then l2); column 0 is k and column 1 the accuracy.
knn_curves = [
    (all_results[:29], "l1"),
    (all_results[29:], 'l2'),
    (all_results_smote[:29], 'l1 - smote'),
    (all_results_smote[29:], 'l2 - smote'),
]
for runs, curve_label in knn_curves:
    plt.plot(runs[:, 0], runs[:, 1], label=curve_label)
plt.title("Scores on KNN by k value")
plt.xlabel("k")
plt.ylabel("score")
plt.xticks(range(2, 31), rotation=90)
plt.legend()
plt.show()

In [None]:
# Summary of the best KNN model found on the original data.
print("\t\tOriginal data")
print("Best score using l{} with {} neighbors".format(best_knn.p, best_knn.n_neighbors))
print(classification_report(y_test, best_pred))
print("Accuracy score", accuracy_score(y_test, best_pred))
# Confusion matrix divided by the total count, so each cell is a share of all test rows.
cm = confusion_matrix(y_test, best_pred)
cm_share = cm / np.sum(cm)
sns.heatmap(cm_share, annot=True, fmt='.2%', cmap='Blues')

In [None]:
# Summary of the best KNN model found on the oversampled data.
print("\t\tAfter smote")
print("Best score using l{} with {} neighbors".format(best_knn_smote.p, best_knn_smote.n_neighbors))
print(classification_report(ys_test, best_pred_smote))
print("Accuracy score", accuracy_score(ys_test, best_pred_smote))
# Confusion matrix divided by the total count, so each cell is a share of all test rows.
cm = confusion_matrix(ys_test, best_pred_smote)
cm_share = cm / np.sum(cm)
sns.heatmap(cm_share, annot=True, fmt='.2%', cmap='Blues')

In [None]:
# Soft-voting ensemble: average the class probabilities of one selected model
# per family on the original test split, then pick the class with the larger
# mean probability (ties go to class 1).
ensemble = [best_forest_smote, best_knn, best_lr, best_svm]
pred = []
for model in ensemble:
    print(model)
    # Give each model exactly the columns it was fitted on. scikit-learn records
    # them in `feature_names_in_` when the estimator was fitted on a DataFrame,
    # which avoids relying on the global `worst_param` (it may have been
    # overwritten by a later cell).
    fitted_columns = getattr(model, "feature_names_in_", None)
    if fitted_columns is not None:
        model_input = x_test[list(fitted_columns)]
    elif model is best_lr and worst_param != features.shape[1]:
        # Fallback when no feature names were recorded: drop the column by index.
        # Compare with != rather than `is not`; identity is not a reliable
        # equality test for ints.
        model_input = x_test.drop(x_test.columns[worst_param], axis=1)
    else:
        model_input = x_test
    pred.append(model.predict_proba(model_input))
probs = sum(pred)/len(ensemble)
final_pred = [0 if p[0] > p[1] else 1 for p in probs]

In [None]:
# Evaluate the averaged-probability ensemble on the original test split.
print(classification_report(y_test, final_pred))
print(accuracy_score(y_test, final_pred))
# Confusion matrix divided by the total count, so each cell is a share of all test rows.
cm = confusion_matrix(y_test, final_pred)
cm_share = cm / np.sum(cm)
sns.heatmap(cm_share, annot=True, fmt='.2%', cmap='Blues')