In [2]:
library(tidyverse)
library(testthat)
library(digest)
library(repr)
library(tidymodels)
library(GGally)
library(ISLR)
library(cowplot)
library(RColorBrewer)
library(themis)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.3     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0

“package ‘ggplot2’ was built under R version 4.0.1”
“package ‘tibble’ was built under R version 4.0.2”
“package ‘tidyr’ was built under R version 4.0.2”
“package ‘dplyr’ was built under R version 4.0.2”
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘testthat’


The following object is masked from ‘package:dplyr’:

    matches


The following object is masked from ‘package:purrr’:

ERROR: Error in library(themis): there is no package called ‘themis’


In [None]:
# Location of the processed Cleveland heart-disease file.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

# Load our dataset.
# The file is read without a header row (col_names = FALSE), so readr names
# the columns X1..X14; they are renamed to descriptive names right away.
# `col_names` is written out in full rather than relying on R's partial
# argument matching.
chest_pain_data <- read_csv(url, col_names = FALSE) %>%
  rename(
    age = X1,
    sex = X2,
    chest_pain_type = X3,
    trest_bps = X4,
    cholesterol = X5,
    fasting_blood_sugar = X6,
    resting_ecg = X7,
    max_heart_rate = X8,
    exercise_induced_angina = X9,
    oldpeak = X10,
    slope = X11,
    no_vessels_colored = X12,
    thal = X13,
    healthy = X14
  )

# Quick structural overview of the loaded columns.
glimpse(chest_pain_data)

In [None]:
# Narrow the data to the outcome (chest_pain_type) and the four other
# columns used in this analysis, then store the outcome as a factor.
chest_pain_data <- mutate(
  select(
    chest_pain_data,
    chest_pain_type, age, trest_bps, cholesterol, max_heart_rate
  ),
  chest_pain_type = as_factor(chest_pain_type)
)

# Preview the first six rows.
chest_pain_data %>% slice(1:6)


In [None]:
# Set the random seed so that the random steps run after this point
# (e.g. the train/test split and the cross-validation folds) are reproducible.
set.seed(10)

In [None]:
# Pairwise scatter plots of age, cholesterol, trest_bps and max_heart_rate,
# coloured by chest pain type, arranged in a 2 x 3 grid.
#
# chest_pain_type codes:
# -- Value 1: typical angina
# -- Value 2: atypical angina
# -- Value 3: non-anginal pain
# -- Value 4: asymptomatic

options(repr.plot.width = 18, repr.plot.height = 9)

# Build one scatter plot of two columns, coloured by chest_pain_type.
#   data         data frame containing x_col, y_col and chest_pain_type
#   x_col, y_col column names, given as strings
#   x_lab, y_lab axis labels; the plot title is "<x_lab> vs <y_lab>", so the
#                title can never disagree with the axes
pair_scatter <- function(data, x_col, y_col, x_lab, y_lab) {
  # Legend labels and colours, listed in the order of the codes above.
  # NOTE(review): assumes the factor levels are ordered 1, 2, 3, 4 - confirm.
  pain_labels <- c(
    "Typical Angina", "Atypical Angina", "Non-anginal Pain", "Asymptomatic"
  )
  pain_colours <- c("orange2", "steelblue2", "lightgreen", "violet")

  ggplot(
    data,
    aes(x = .data[[x_col]], y = .data[[y_col]], color = chest_pain_type)
  ) +
    geom_point(alpha = 0.7) +
    labs(x = x_lab, y = y_lab, color = "Chest Pain Type") +
    scale_color_manual(labels = pain_labels, values = pain_colours) +
    ggtitle(paste(x_lab, "vs", y_lab)) +
    theme(text = element_text(size = 15))
}

age_vs_cho <- chest_pain_data %>%
  pair_scatter("age", "cholesterol", "Age", "Cholesterol")

age_vs_bp <- chest_pain_data %>%
  pair_scatter("age", "trest_bps", "Age", "Resting Blood Pressure")

age_vs_mhr <- chest_pain_data %>%
  pair_scatter("age", "max_heart_rate", "Age", "Max Heart Rate")

cho_vs_bp <- chest_pain_data %>%
  pair_scatter(
    "cholesterol", "trest_bps", "Cholesterol", "Resting Blood Pressure"
  )

cho_vs_mhr <- chest_pain_data %>%
  pair_scatter(
    "cholesterol", "max_heart_rate", "Cholesterol", "Max Heart Rate"
  )

rbp_vs_mhr <- chest_pain_data %>%
  pair_scatter(
    "trest_bps", "max_heart_rate", "Resting Blood Pressure", "Max Heart Rate"
  )

# Combine the six plots into a single figure.
panel <- plot_grid(
  age_vs_cho,
  age_vs_bp,
  age_vs_mhr,
  cho_vs_bp,
  cho_vs_mhr,
  rbp_vs_mhr,
  nrow = 2,
  ncol = 3
)
panel

In [None]:
# Hold out a test set: 75% of the rows go to the training set and the rest
# to the testing set. Sampling is stratified on chest_pain_type.
chest_pain_data_split <- chest_pain_data %>%
  initial_split(prop = 0.75, strata = chest_pain_type)

chest_pain_data_train <- chest_pain_data_split %>% training()
chest_pain_data_test <- chest_pain_data_split %>% testing()

# Preview the first six rows of each partition.
chest_pain_data_train %>% slice(1:6)
chest_pain_data_test %>% slice(1:6)

In [None]:
# 5-fold cross-validation folds built from the training set,
# stratified on chest_pain_type.
vfold <- chest_pain_data_train %>%
  vfold_cv(v = 5, strata = chest_pain_type)

# K-nearest-neighbours classification spec (kknn engine, rectangular
# weights). `neighbors` is left as a tune() placeholder so it can be
# chosen by tuning.
knn_tune <- nearest_neighbor(
  neighbors = tune(),
  weight_func = "rectangular"
) %>%
  set_mode("classification") %>%
  set_engine("kknn")

In [None]:
# Recipe: predict chest_pain_type from age and max_heart_rate, with every
# predictor standardized (scaled, then centered).
cp_recipe <- recipe(chest_pain_type ~ age + max_heart_rate,
                    data = chest_pain_data_train) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors())

# Tune the number of neighbors over the cross-validation folds and collect
# the resulting metrics.
knn_result <- workflow() %>%
  add_recipe(cp_recipe) %>%
  add_model(knn_tune) %>%
  tune_grid(resamples = vfold, grid = 50) %>%
  collect_metrics()
slice(knn_result, 1:6)

# Keep only the accuracy rows.
accuracies <- knn_result %>%
  filter(.metric == "accuracy")

# Number of neighbors with the highest mean accuracy (first one on ties).
# Derived from the results so the plot title always matches the data.
best_k <- accuracies %>%
  slice_max(mean, n = 1, with_ties = FALSE) %>%
  pull(neighbors)

# Upper x-axis limit: at least 13, but extended to the largest K that was
# evaluated so that no point falls outside the scale limits and gets dropped.
x_max <- max(13, accuracies$neighbors)

options(repr.plot.width = 7, repr.plot.height = 6)
# Accuracy estimate as a function of the number of neighbors.
cross_val_plot <- accuracies %>%
  ggplot(aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Neighbors", y = "Accuracy Estimate") +
  ggtitle(paste("n =", best_k)) +
  theme(text = element_text(size = 20)) +
  scale_x_continuous(limits = c(0, x_max), breaks = seq(0, x_max, by = 1)) +
  scale_y_continuous(limits = c(0, 1))
cross_val_plot

In [None]:

# K-nearest-neighbours classifier with the number of neighbors fixed at 11
# (kknn engine, rectangular weights).
knn_spec <- nearest_neighbor(
  neighbors = 11,
  weight_func = "rectangular"
) %>%
  set_mode("classification") %>%
  set_engine("kknn")

# Assemble recipe + model into a workflow and fit it on the training set.
# (The result is stored under the name `fit`; R still resolves the call to
# the fit() function because it looks for a function when a name is called.)
knn_workflow_unfitted <- add_model(add_recipe(workflow(), cp_recipe), knn_spec)
fit <- fit(knn_workflow_unfitted, data = chest_pain_data_train)
rm(knn_workflow_unfitted)

# Predict on the test set and attach the predictions to the test rows.
cp_predictions <- bind_cols(
  predict(fit, chest_pain_data_test),
  chest_pain_data_test
)

# Performance metrics of the predictions against the true chest_pain_type.
metrics <- metrics(
  cp_predictions,
  truth = chest_pain_type,
  estimate = .pred_class
)
metrics

# Confusion matrix of predicted vs. true chest_pain_type.
confusion <- conf_mat(
  cp_predictions,
  truth = chest_pain_type,
  estimate = .pred_class
)
confusion

In [None]:
# Recipe: predict chest_pain_type from trest_bps and max_heart_rate, with
# every predictor standardized (scaled, then centered).
cp_recipe <- recipe(chest_pain_type ~ trest_bps + max_heart_rate,
                    data = chest_pain_data_train) %>%
  # Alternatives kept for reference (not active):
  # cp_recipe <- recipe(chest_pain_type~age+trest_bps+cholesterol+max_heart_rate, data = chest_pain_data_train) %>%
  #    step_upsample(chest_pain_type, over_ratio = 1) %>%         # fix imbalances
  step_scale(all_predictors()) %>%
  step_center(all_predictors())

# Tune the number of neighbors over the cross-validation folds and collect
# the resulting metrics.
knn_result <- workflow() %>%
  add_recipe(cp_recipe) %>%
  add_model(knn_tune) %>%
  tune_grid(resamples = vfold, grid = 50) %>%
  collect_metrics()
slice(knn_result, 1:6)

# Keep only the accuracy rows.
accuracies <- knn_result %>%
  filter(.metric == "accuracy")

# Number of neighbors with the highest mean accuracy (first one on ties).
# Derived from the results so the plot title always matches the data.
best_k <- accuracies %>%
  slice_max(mean, n = 1, with_ties = FALSE) %>%
  pull(neighbors)

# Upper x-axis limit: at least 13, but extended to the largest K that was
# evaluated so that no point falls outside the scale limits and gets dropped.
x_max <- max(13, accuracies$neighbors)

options(repr.plot.width = 7, repr.plot.height = 6)
# Accuracy estimate as a function of the number of neighbors.
cross_val_plot <- accuracies %>%
  ggplot(aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Neighbors", y = "Accuracy Estimate") +
  ggtitle(paste("n =", best_k)) +
  theme(text = element_text(size = 20)) +
  scale_x_continuous(limits = c(0, x_max), breaks = seq(0, x_max, by = 1)) +
  scale_y_continuous(limits = c(0, 1))
cross_val_plot

In [None]:
# K-nearest-neighbours classifier with the number of neighbors fixed at 11
# (kknn engine, rectangular weights).
knn_spec <- nearest_neighbor(
  neighbors = 11,
  weight_func = "rectangular"
) %>%
  set_mode("classification") %>%
  set_engine("kknn")

# Assemble the current cp_recipe and the model into a workflow and fit it on
# the training set. (The result is stored under the name `fit`; R still
# resolves the call to the fit() function because it looks for a function
# when a name is called.)
knn_workflow_unfitted <- add_model(add_recipe(workflow(), cp_recipe), knn_spec)
fit <- fit(knn_workflow_unfitted, data = chest_pain_data_train)
rm(knn_workflow_unfitted)

# Predict on the test set and attach the predictions to the test rows.
cp_predictions <- bind_cols(
  predict(fit, chest_pain_data_test),
  chest_pain_data_test
)

# Performance metrics of the predictions against the true chest_pain_type.
metrics <- metrics(
  cp_predictions,
  truth = chest_pain_type,
  estimate = .pred_class
)
metrics

# Confusion matrix of predicted vs. true chest_pain_type.
confusion <- conf_mat(
  cp_predictions,
  truth = chest_pain_type,
  estimate = .pred_class
)
confusion