## STA160: Midterm Exam

## Heart Disease Health Indicators

### 1. Load Important Libraries

In [None]:
import numpy as np
import pandas as pd
import sklearn
import time
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")

from sklearn.ensemble import ExtraTreesClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB


from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, plot_confusion_matrix, plot_roc_curve

### 2. Load the Dataset

In [None]:
# Location of the heart-disease indicators CSV on the author's machine;
# adjust this path when running the notebook elsewhere.
HEART_CSV_PATH = "/Users/selamawit/Desktop/STA160/Midterm/heart.csv"
heart = pd.read_csv(HEART_CSV_PATH)

|No| Variable Code | Name | Description |
|---| --- | --- | --- |
|1 | HeartDiseaseorAttack | Heart Disease or Heart Attack | Respondents who reported having heart disease or heart attack |
|2 | HighBP | High Blood Pressure | Patients with high blood pressure |
|3 | HighChol | High Cholesterol | Patients with high cholesterol |
|4 | CholCheck | Cholesterol Check | Patients who have had a cholesterol check within the past five years |
|5 | BMI | Body Mass Index | Patients record of Body Mass Index |
|6 | Smoker | Smoking | Patients who smoked at least 100 cigarettes in their entire life |
|7 | Stroke | Stroke | Patients who had stroke |
|8 | Diabetes | Diabetes | Patients who had diabetes |
|9 | PhysActivity | Physical Activity | Patients who exercised in the past thirty days, other than at their regular job |
|10 | DiffWalk | Difficulty Walking | Patients who experience serious difficulty walking or climbing stairs |
|11 | Age | Age | Fourteen-level age category |

In [None]:
# Preview the first rows of the loaded dataset.
heart.head()

In [None]:
# (rows, columns) of the raw dataset.
heart.shape

### 3. Exploratory Data Analysis

In [None]:
# Record the size before cleaning so the effect of de-duplication is visible.
original_shape = heart.shape

# drop_duplicates(inplace=True) modifies `heart` and returns None, so it must
# be called as a statement rather than passed to print().
heart.drop_duplicates(inplace=True)

print('Original dataframe dimension:', original_shape)
print('DataFrame dimension after dropping duplicate rows:', heart.shape)

In [None]:
# Report how many missing values each column contains, one "name: count" line
# per column.
null_counts = heart.isnull().sum()
for column_name, n_missing in null_counts.items():
    print("{}: {}".format(column_name, n_missing))

In [None]:
# Response variable followed by the predictors retained for modelling.
significant_columns = [
    "HeartDiseaseorAttack",
    "HighBP",
    "HighChol",
    "Smoker",
    "Stroke",
    "Diabetes",
    "PhysActivity",
    "DiffWalk",
    "Age",
]
data = heart[significant_columns]

# Summary statistics of the selected columns.
data.describe()

In [None]:
print("Information about the dataset:")
# DataFrame.info() writes its summary to stdout itself and returns None;
# wrapping it in print() would emit a stray "None" line after the summary.
data.info()

In [None]:
# Distinct values taken by the Age column.
data.Age.unique()

### 4. Classification Model Building using Cross Validation

In [None]:
data1 = data.copy()

# Response vector and design matrix. The predictors are selected by dropping
# the target by name: the previous positional slice `iloc[:, 2:]` also
# discarded HighBP (column 1), silently removing a predictor from every model.
Y = data1["HeartDiseaseorAttack"]
X = data1.drop(columns="HeartDiseaseorAttack")
print("Shape of X design matrix: ", X.shape)
print("Shape of Y vector: ", Y.shape)

# 70/30 split, stratified on Y so both parts keep the same class proportions.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=10, stratify=Y)
# Class proportions in the training set.
print(Y_train.value_counts(normalize=True))

# Scaling: fit the scaler on the training set only and reuse its parameters
# for the test set, so no information from the test data enters the fit.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Compute running time:
start = time.time()

# Candidate classifiers, as (display name, unfitted estimator) pairs.
model = [
    ('Linear Discriminant', LinearDiscriminantAnalysis()),
    ('Logistic Regression', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('Support Vector Machine', SVC()),
    ('Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(criterion="entropy", random_state=100, max_depth=2, min_samples_leaf=5)),
]

# Model Evaluation
results = []
names = []

# The splitter does not depend on the model, so build it once; every
# classifier is then scored on the same 10 unshuffled folds.
kfold = KFold(n_splits=10, random_state=None)

for n, m in model:
    cross_val = cross_val_score(m, X_train, Y_train, cv=kfold, scoring="accuracy")
    results.append(cross_val)
    names.append(n)
    print("The estimated accuracy score for", n, ":")
    print(cross_val)
    print(n, ":", "%0.2f accuracy with a standard deviation of %0.2f" % (cross_val.mean(), cross_val.std()))

end = time.time()
# (end - start) is in seconds; dividing by 60 gives minutes, so label it so.
print("Computation running time: ", round((end - start)/60, 2), "mins.")

### 6. Classification Model Evaluation Metrics

<div class="alert-success">
1. Confusion Matrix 
</div>

### (i) Linear Discriminant Analysis

In [None]:
start = time.time()

### Build and train model
lda = LinearDiscriminantAnalysis()
# Only the fitted classifier is needed; the projected training data that
# .transform() would return was never used.
lda.fit(X_train, Y_train)
print("LDA model correctly predicts whether a patient has heart disease attack or not about",round(lda.score(X_train, Y_train)*100, 2), "% of the time.")

### Check performance of our classifier
# Accuracy (%) = correct predictions on the confusion-matrix diagonal / N * 100.
# round() must wrap the final percentage: previously it wrapped only the
# integer count and the 2 was passed to format() as an ignored extra argument,
# so the printed value was never rounded.
cm_train = confusion_matrix(Y_train, lda.predict(X_train))
print('Accuracy for training set for LDA = {}'.format(round((cm_train[0][0] + cm_train[1][1]) / len(Y_train) * 100, 2)))
print(cm_train)
Yhat1 = lda.predict(X_test)
cm_test = confusion_matrix(Y_test, Yhat1)
print('Accuracy for test set for LDA = {}'.format(round((cm_test[0][0] + cm_test[1][1]) / len(Y_test) * 100, 2)))
print(cm_test)

### Plot confusion
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions use ConfusionMatrixDisplay.from_estimator instead.
plot_confusion_matrix(estimator=lda,
                      X = X_test,
                      y_true = Y_test,
                      display_labels=["No Disease/Attack", "Yes Disease/Attack"], cmap = "Pastel1_r")

### Computation time
end = time.time()
# (end - start) is in seconds; dividing by 60 gives minutes, so label it so.
print("Computation running time: ", round((end - start)/60,2), "mins.")

### (ii) Logistic Regression

In [None]:
start = time.time()
### Build and train model
# Unpenalised logistic regression.
logR = LogisticRegression(penalty = "none")
logR.fit(X_train, Y_train)
print("Logistic Regression model correctly predicts whether a patient has heart disease attack or not about",round(logR.score(X_train, Y_train)*100, 2), "%.")

### Check performance of classifier
Yhat2 = logR.predict(X_test) #predicts label
# Accuracy (%) = correct predictions on the confusion-matrix diagonal / N * 100.
# round() must wrap the final percentage: previously it wrapped only the
# integer count and the 2 was passed to format() as an ignored extra argument.
cm_test = confusion_matrix(Y_test, Yhat2)
print('Accuracy for test set for Logistic Regression = {}'.format(
    round((cm_test[0][0] + cm_test[1][1]) / len(Y_test) * 100, 2)))
print(cm_test)

### Plot confusion matrix
plot_confusion_matrix(estimator=logR,
                      X = X_test,
                      y_true = Y_test,
                      display_labels=["No Disease/Attack", "Yes Disease/Attack"], cmap = "Pastel1_r")

### Computation time
end = time.time()
# (end - start) is in seconds; dividing by 60 gives minutes, so label it so.
print("Computation running time: ", round((end - start)/60,2), "mins.")

### (iii) Decision Tree

In [None]:
start = time.time()

###############################
### Build and train model
clf_entropy = DecisionTreeClassifier(criterion = "entropy", random_state = 100, max_depth = 2, min_samples_leaf = 5)
# fit() returns the estimator itself, so Yhat_e is the fitted tree (an alias
# of clf_entropy), not a vector of predictions.
Yhat_e = clf_entropy.fit(X_train, Y_train)

### Check performance of classifier
Y_pred_e = clf_entropy.predict(X_test)
# Accuracy (%) = correct predictions on the confusion-matrix diagonal / N * 100.
# round() must wrap the final percentage: previously it wrapped only the
# integer count and the 2 was passed to format() as an ignored extra argument.
cm_test = confusion_matrix(Y_test, Y_pred_e)
print('Accuracy for test set for Decision Tree = {}'.format(
    round((cm_test[0][0] + cm_test[1][1]) / len(Y_test) * 100, 2)))
print(cm_test)

### Plot confusion matrix
plot_confusion_matrix(estimator=clf_entropy,
                      X = X_test,
                      y_true = Y_test,
                      display_labels=["No Disease/Attack", "Yes Disease/Attack"], cmap = "Pastel1_r")

### Computation time
end = time.time()
# (end - start) is in seconds; dividing by 60 gives minutes, so label it so.
print("Computation running time: ", round((end - start)/60,2), "mins.")

### (iv) Support Vector Machine

In [None]:
start = time.time()

###############################
### Build and train model
classifier = SVC(kernel = 'linear')
classifier.fit(X_train, Y_train)
# score() returns a fraction in [0, 1]; convert to a percentage to match the
# "%" in the message and the other model cells.
print("SVM model correctly predicts whether a patient has heart disease attack or not about", round(classifier.score(X_train, Y_train)*100, 2), "%.")

### Check performance of classifier
Yhat3 = classifier.predict(X_test) #predict Yhat
# Both diagonal entries must come from the SVM predictions (Yhat3). The true
# positives were previously taken from Yhat2, the logistic-regression
# predictions, which made the reported SVM accuracy wrong.
cm_test = confusion_matrix(Y_test, Yhat3)
print('Accuracy for test set for svm = {}'.format(
    round((cm_test[0][0] + cm_test[1][1]) / len(Y_test) * 100, 2)))
print(cm_test)

### Plot confusion matrix
plot_confusion_matrix(estimator=classifier,
                      X = X_test,
                      y_true = Y_test,
                      display_labels=["No Disease/Attack", "Yes Disease/Attack"], cmap = "Pastel1_r")

### Computation time
end = time.time()
# (end - start) is in seconds; dividing by 60 gives minutes, so label it so.
print("Computation running time: ", round((end - start)/60,2), "mins.")

Although the SVM has a relatively good accuracy score, its running time is long.

<div class="alert-success">
2. Classification Report
</div>

### (i) Linear Discriminant

In [None]:
# Per-class precision, recall and F1 for LDA on the test set.
class_labels = ["No Disease/Attack", "Yes Disease/Attack"]
lda_test_pred = lda.predict(X_test)
lda_report = classification_report(y_true=Y_test, y_pred=lda_test_pred, target_names=class_labels)
print(lda_report)

### (ii) Logistic Regression

In [None]:
# Per-class precision, recall and F1 for logistic regression on the test set,
# using the predictions already stored in Yhat2.
class_labels = ["No Disease/Attack", "Yes Disease/Attack"]
logR_report = classification_report(y_true=Y_test, y_pred=Yhat2, target_names=class_labels)
print(logR_report)

### (iii) Decision Tree

In [None]:
# Per-class precision, recall and F1 for the decision tree on the test set,
# using the predictions already stored in Y_pred_e.
class_labels = ["No Disease/Attack", "Yes Disease/Attack"]
tree_report = classification_report(Y_test, Y_pred_e, target_names=class_labels)
print(tree_report)

Both FN and TN being 0 is a consequence of the class imbalance in the data.

### (iv) Support Vector Machine

In [None]:
# Per-class precision, recall and F1 for the SVM on the test set, using the
# predictions already stored in Yhat3.
class_labels = ["No Disease/Attack", "Yes Disease/Attack"]
svm_report = classification_report(y_true=Y_test, y_pred=Yhat3, target_names=class_labels)
print(svm_report)

Both FN and TN being 0 is a consequence of the class imbalance in the data.

<div class="alert-success">
3. Comparison of Classifiers using ROC curve
</div>

In [None]:
# Overlay the ROC curves of the four fitted classifiers on one set of axes.
# The curves are drawn on the held-out test set, like the confusion matrices
# and classification reports in this section; curves drawn on the training
# data would overstate how well the models generalise.
roc_lda = plot_roc_curve(estimator=lda, X=X_test, y=Y_test)
# Each display gets its own name instead of reusing roc_log_reg for three
# different models; all of them share the axes created by the first plot.
roc_log_reg = plot_roc_curve(estimator=logR, X=X_test, y=Y_test, ax=roc_lda.ax_)
roc_tree = plot_roc_curve(estimator=clf_entropy, X=X_test, y=Y_test, ax=roc_lda.ax_)
roc_svm = plot_roc_curve(estimator=classifier, X=X_test, y=Y_test, ax=roc_lda.ax_)
plt.title("ROC Curve Comparison")
plt.show()

Both the LDA and Logistic Regression classifiers have AUC values close to 1, indicating that they separate the two classes well across all threshold values.

### 7. Reference:

https://www.kaggle.com/datasets/alexteboul/heart-disease-health-indicators-dataset