In [9]:
library(tidyverse)
library(tidymodels)
library(repr)
library(RColorBrewer)
# library(themis)

In [10]:
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
download.file(url, "data.mod")

# Load our dataset
chest_pain_data <- read_csv("data.mod", col_name = FALSE)
slice(chest_pain_data,1:6)

Parsed with column specification:
cols(
  X1 = [32mcol_double()[39m,
  X2 = [32mcol_double()[39m,
  X3 = [32mcol_double()[39m,
  X4 = [32mcol_double()[39m,
  X5 = [32mcol_double()[39m,
  X6 = [32mcol_double()[39m,
  X7 = [32mcol_double()[39m,
  X8 = [32mcol_double()[39m,
  X9 = [32mcol_double()[39m,
  X10 = [32mcol_double()[39m,
  X11 = [32mcol_double()[39m,
  X12 = [31mcol_character()[39m,
  X13 = [31mcol_character()[39m,
  X14 = [32mcol_double()[39m
)



X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>
63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0
56,1,2,120,236,0,0,178,0,0.8,1,0.0,3.0,0


In [11]:
# add colomn names
names(chest_pain_data) <- c("age", "sex",
                         "chest_pain_type",
                         "trest_bps",
                          "cholesterol",
                          "fasting_blood_sugar",
                          "resting_ecg",
                          "max_heart_rate",
                          "exercise_induced_angina",
                          "oldpeak",
                          "slope",
                          "no_vessels_colored",
                          "thal",
                          "healthy")

In [12]:
# convert all categorical variables into a factor using the as_factor() function.
chest_pain_data <- mutate(chest_pain_data, sex = as_factor(sex),
                          chest_pain_type = as_factor(chest_pain_type),
                          fasting_blood_sugar = as_factor(fasting_blood_sugar),
                          resting_ecg = as_factor(resting_ecg),
                          exercise_induced_angina = as_factor(exercise_induced_angina),
                          slope = as_factor(slope),
                          no_vessels_colored = as_factor(no_vessels_colored),
                          thal = as_factor(thal),
                          healthy = as_factor(healthy)
                          )
# slice(chest_pain_data,1:6) 

In [16]:
# split our dataset into a training dataset and a testing dataset
# Randomly take 75% of the data in the training set. 
# This will be proportional to the different number of fruit names in the dataset.

chest_pain_data_split <- initial_split(chest_pain_data, prop = 0.75, strata =chest_pain_type )  
chest_pain_data_train <- training(chest_pain_data_split)   
chest_pain_data_test <- testing(chest_pain_data_split)

 slice(chest_pain_data_train,1:6) 
 slice(chest_pain_data_test,1:6) 

age,sex,chest_pain_type,trest_bps,cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate,exercise_induced_angina,oldpeak,slope,no_vessels_colored,thal,healthy
<dbl>,<fct>,<fct>,<dbl>,<dbl>,<fct>,<fct>,<dbl>,<fct>,<dbl>,<fct>,<fct>,<fct>,<fct>
63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0
63,1,4,130,254,0,2,147,0,1.4,2,1.0,7.0,2
53,1,4,140,203,1,2,155,1,3.1,3,0.0,7.0,1
56,1,3,130,256,1,2,142,1,0.6,2,1.0,6.0,2


age,sex,chest_pain_type,trest_bps,cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate,exercise_induced_angina,oldpeak,slope,no_vessels_colored,thal,healthy
<dbl>,<fct>,<fct>,<dbl>,<dbl>,<fct>,<fct>,<dbl>,<fct>,<dbl>,<fct>,<fct>,<fct>,<fct>
67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
56,1,2,120,236,0,0,178,0,0.8,1,0.0,3.0,0
62,0,4,140,268,0,2,160,0,3.6,3,2.0,3.0,3
57,0,4,120,354,0,0,163,1,0.6,1,0.0,3.0,0
57,1,4,140,192,0,0,148,0,0.4,2,0.0,6.0,0


In [17]:
# cross validation
vfold <- vfold_cv(chest_pain_data_train, v=5, strata = chest_pain_type)

In [18]:
# recipe and standardization
cp_recipe <- recipe(chest_pain_type~age+trest_bps+cholesterol+max_heart_rate, data = chest_pain_data_train) %>%
    step_upsample(Class, over_ratio = 1, skip = FALSE) %>%         # fix imbalances
    step_scale(all_predictors()) %>%
    step_center(all_predictors())

# model
knn_tune <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) %>%
    set_engine("kknn") %>%
    set_mode("classification")

# workflow
knn_result <- workflow() %>%
    add_recipe(cp_recipe) %>%
    add_model(knn_tune) %>%
    tune_grid(resamples = vfold, grid = 50) %>%
    collect_metrics()

# accuracy
accuracies <- knn_result %>%
    filter(.metric == "accuracy")

# accuracy plot
cross_val_plot <- accuracies %>%
    ggplot(aes(x=neighbors, y=mean)) +
    geom_point()+
    geom_line() +
    labs(x="Neighbors", y="Accuracy Estimate")
cross_val_plot

# build model with k=12
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 11) %>%
    set_engine("kknn") %>%
    set_mode("classification")

# new workflow
fit <- workflow() %>%
    add_recipe(cp_recipe) %>%
    add_model(knn_spec) %>%
    fit(data=chest_pain_data_train)

# predict
cp_predictions <- predict(fit, chest_pain_data_test) %>%
    bind_cols(chest_pain_data_test)

# compute accuracy
metrics <- cp_predictions %>%
    metrics(truth = chest_pain_type, estimate = .pred_class)
metrics

# confusion matrix
confusion <- cp_predictions %>%
    conf_mat(truth = chest_pain_type, estimate = .pred_class)
confusion

[31mx[39m [31mFold1: recipe: Error: Can't subset columns that don't exist.
[31m✖[31m Column...[39m

[31mx[39m [31mFold2: recipe: Error: Can't subset columns that don't exist.
[31m✖[31m Column...[39m

[31mx[39m [31mFold3: recipe: Error: Can't subset columns that don't exist.
[31m✖[31m Column...[39m

[31mx[39m [31mFold4: recipe: Error: Can't subset columns that don't exist.
[31m✖[31m Column...[39m

[31mx[39m [31mFold5: recipe: Error: Can't subset columns that don't exist.
[31m✖[31m Column...[39m

“All models failed in tune_grid(). See the `.notes` column.”


ERROR: Error: All of the models failed. See the .notes column.
