In [None]:
library(tidyverse)
library(repr)
library(tidymodels)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.7     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



In [None]:
# Seed the random number generator for the steps that follow.
set.seed(209)

# Read the heart-disease data set from the project repository and store
# restecg as a factor in a single pipeline.
heart_disease_data <- read_csv(
    "https://raw.githubusercontent.com/QiXu2022/dsci-100-project-Group_66/main/heart_disease_uci.csv"
) |>
    mutate(restecg = as_factor(restecg))

In [None]:
#tidying the data
tidy_data<-heart_disease_data |>
    filter(dataset=="Cleveland") |>
    select(age, trestbps, chol, restecg) 
tidy_data

In [None]:
# Seed the RNG in the same cell as the random split, so that re-running this
# cell on its own (or out of order) always yields the same partition.
set.seed(209)

# Split the tidied data into 75% training and 25% testing rows, stratifying on
# the outcome restecg.
heart_split <- initial_split(tidy_data, prop = 0.75, strata = restecg)
heart_train <- training(heart_split)
heart_test <- testing(heart_split)

#### First, we build a model that uses age, trestbps and chol to predict restecg by k-NN classification.


In [None]:
set.seed(2020)
## 5-fold cross validation: tune k for the model that uses every predictor

# Recipe: restecg is the outcome and all remaining columns of heart_train are
# predictors; each predictor is centred and then scaled.
heart_recipe_all <- recipe(restecg ~ ., data = heart_train) |>
    step_center(all_predictors()) |>
    step_scale(all_predictors())

# K-NN classification spec with the number of neighbours left as a tuning
# parameter. The later tuning cells reuse this object.
heart_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
    set_engine("kknn") |>
    set_mode("classification")

# Five cross-validation folds of the training set, stratified on the outcome.
# The later tuning cells reuse these folds.
heart_vf <- vfold_cv(heart_train, v = 5, strata = restecg)

# Run the tuning grid over the folds and collect the resampled metrics.
heart_results_all <- workflow() |>
    add_recipe(heart_recipe_all) |>
    add_model(heart_spec) |>
    tune_grid(resamples = heart_vf, grid = 15) |>
    collect_metrics()

# Keep only the accuracy rows.
heart_accuracies_all <- heart_results_all |>
    filter(.metric == "accuracy", .estimator == "multiclass")

heart_accuracies_all

# Plot the cross-validated accuracy estimate against the number of neighbours.
heart_accuracy_plot_all <- ggplot(heart_accuracies_all, aes(x = neighbors, y = mean)) +
    geom_point() +
    geom_line() +
    labs(x = "Neighbors", y = "Accuracy Estimate") +
    scale_x_continuous(breaks = seq(1, 16, by = 1)) +  # one tick per value of k
    scale_y_continuous(limits = c(0.4, 1))  # fixed y-axis range; points outside it are dropped

heart_accuracy_plot_all

According to the plot above, we choose k = 14.

In [1]:
# Final K-NN spec for the all-predictor model, with k = 14 as chosen from the
# accuracy plot.
heart_retrain_spec_all <- nearest_neighbor(weight_func = "rectangular", neighbors = 14) |>
    set_engine("kknn") |>
    set_mode("classification")

# Fit the recipe and model on the full training set.
heart_fit_all <- workflow() |>
    add_recipe(heart_recipe_all) |>
    add_model(heart_retrain_spec_all) |>
    fit(data = heart_train)

# Predict the test set and attach the predictions to the test rows.
heart_predictions_all <- predict(heart_fit_all, heart_test) |>
    bind_cols(heart_test)

heart_predictions_all

# Test-set performance metrics (predicted class vs. true restecg).
heart_metrics_all <- heart_predictions_all |>
    metrics(truth = restecg, estimate = .pred_class)
heart_metrics_all

# Confusion matrix of predicted vs. true restecg on the test set.
heart_conf_mat_all <- heart_predictions_all |>
    conf_mat(truth = restecg, estimate = .pred_class)
heart_conf_mat_all





ERROR: Error in set_mode(set_engine(nearest_neighbor(weight_func = "rectangular", : could not find function "set_mode"


#### Next, we build the second model, which uses age and trestbps to predict restecg by k-NN classification.

In [None]:
set.seed(2020)
## 5-fold cross validation: tune k for the model that uses age and trestbps

# Recipe: predict restecg from age and trestbps only; each predictor is
# centred and then scaled.
heart_recipe_at <- recipe(restecg ~ age + trestbps, data = heart_train) |>
    step_center(all_predictors()) |>
    step_scale(all_predictors())

# Tune k using the tunable spec (heart_spec) and the folds (heart_vf) that
# were created in the first tuning cell.
heart_results_at <- workflow() |>
    add_recipe(heart_recipe_at) |>
    add_model(heart_spec) |>
    tune_grid(resamples = heart_vf, grid = 15) |>
    collect_metrics()

# Keep only the accuracy rows.
heart_accuracies_at <- heart_results_at |>
    filter(.metric == "accuracy", .estimator == "multiclass")

heart_accuracies_at

# Plot the cross-validated accuracy estimate against the number of neighbours.
heart_accuracy_plot_at <- ggplot(heart_accuracies_at, aes(x = neighbors, y = mean)) +
    geom_point() +
    geom_line() +
    labs(x = "Neighbors", y = "Accuracy Estimate") +
    scale_x_continuous(breaks = seq(1, 16, by = 1)) +  # one tick per value of k
    scale_y_continuous(limits = c(0.4, 1))  # fixed y-axis range; points outside it are dropped

heart_accuracy_plot_at

According to the plot above, we choose k = 9.

In [None]:
# Final K-NN spec for the age + trestbps model, with k = 9 as chosen from the
# accuracy plot.
heart_retrain_spec_at <- nearest_neighbor(weight_func = "rectangular", neighbors = 9) |>
    set_engine("kknn") |>
    set_mode("classification")

# Fit the recipe and model on the full training set.
heart_fit_at <- workflow() |>
    add_recipe(heart_recipe_at) |>
    add_model(heart_retrain_spec_at) |>
    fit(data = heart_train)

# Predict the test set and attach the predictions to the test rows.
heart_predictions_at <- predict(heart_fit_at, heart_test) |>
    bind_cols(heart_test)
heart_predictions_at

# Test-set performance metrics (predicted class vs. true restecg).
heart_metrics_at <- heart_predictions_at |>
    metrics(truth = restecg, estimate = .pred_class)
heart_metrics_at

# Confusion matrix of predicted vs. true restecg on the test set.
heart_conf_mat_at <- heart_predictions_at |>
    conf_mat(truth = restecg, estimate = .pred_class)
heart_conf_mat_at


#### Next, we build the third model, which uses age and chol to predict restecg by k-NN classification.

In [None]:
set.seed(2020)
## 5-fold cross validation: tune k for the model that uses age and chol

# Recipe: predict restecg from age and chol only; each predictor is centred
# and then scaled.
heart_recipe_ac <- recipe(restecg ~ age + chol, data = heart_train) |>
    step_center(all_predictors()) |>
    step_scale(all_predictors())

# Tune k using the tunable spec (heart_spec) and the folds (heart_vf) that
# were created in the first tuning cell.
heart_results_ac <- workflow() |>
    add_recipe(heart_recipe_ac) |>
    add_model(heart_spec) |>
    tune_grid(resamples = heart_vf, grid = 15) |>
    collect_metrics()

# Keep only the accuracy rows.
heart_accuracies_ac <- heart_results_ac |>
    filter(.metric == "accuracy", .estimator == "multiclass")

heart_accuracies_ac

# Plot the cross-validated accuracy estimate against the number of neighbours.
heart_accuracy_plot_ac <- ggplot(heart_accuracies_ac, aes(x = neighbors, y = mean)) +
    geom_point() +
    geom_line() +
    labs(x = "Neighbors", y = "Accuracy Estimate") +
    scale_x_continuous(breaks = seq(1, 16, by = 1)) +  # one tick per value of k
    scale_y_continuous(limits = c(0.4, 1))  # fixed y-axis range; points outside it are dropped

heart_accuracy_plot_ac

According to the plot above, we choose k = 5.

In [2]:
# Final K-NN spec for the age + chol model, with k = 5 as chosen from the
# accuracy plot.
heart_retrain_spec_ac <- nearest_neighbor(weight_func = "rectangular", neighbors = 5) |>
    set_engine("kknn") |>
    set_mode("classification")

# Fit the recipe and model on the full training set.
heart_fit_ac <- workflow() |>
    add_recipe(heart_recipe_ac) |>
    add_model(heart_retrain_spec_ac) |>
    fit(data = heart_train)

# Predict the test set and attach the predictions to the test rows.
heart_predictions_ac <- predict(heart_fit_ac, heart_test) |>
    bind_cols(heart_test)
heart_predictions_ac

# Test-set performance metrics (predicted class vs. true restecg).
heart_metrics_ac <- heart_predictions_ac |>
    metrics(truth = restecg, estimate = .pred_class)
heart_metrics_ac

# Confusion matrix of predicted vs. true restecg on the test set.
heart_conf_mat_ac <- heart_predictions_ac |>
    conf_mat(truth = restecg, estimate = .pred_class)
heart_conf_mat_ac

ERROR: Error in set_mode(set_engine(nearest_neighbor(weight_func = "rectangular", : could not find function "set_mode"


#### Finally, we build the last model, which uses trestbps and chol to predict restecg by k-NN classification.

In [None]:
set.seed(2020)
## 5-fold cross validation: tune k for the model that uses trestbps and chol

# Recipe: predict restecg from trestbps and chol only; each predictor is
# centred and then scaled.
heart_recipe_tc <- recipe(restecg ~ trestbps + chol, data = heart_train) |>
    step_center(all_predictors()) |>
    step_scale(all_predictors())

# Tune k using the tunable spec (heart_spec) and the folds (heart_vf) that
# were created in the first tuning cell.
heart_results_tc <- workflow() |>
    add_recipe(heart_recipe_tc) |>
    add_model(heart_spec) |>
    tune_grid(resamples = heart_vf, grid = 15) |>
    collect_metrics()

# Keep only the accuracy rows.
heart_accuracies_tc <- heart_results_tc |>
    filter(.metric == "accuracy", .estimator == "multiclass")

heart_accuracies_tc

# Plot the cross-validated accuracy estimate against the number of neighbours.
heart_accuracy_plot_tc <- ggplot(heart_accuracies_tc, aes(x = neighbors, y = mean)) +
    geom_point() +
    geom_line() +
    labs(x = "Neighbors", y = "Accuracy Estimate") +
    scale_x_continuous(breaks = seq(1, 16, by = 1)) +  # one tick per value of k
    scale_y_continuous(limits = c(0.4, 1))  # fixed y-axis range; points outside it are dropped

heart_accuracy_plot_tc

According to the plot above, we choose k = 11.

In [None]:
# Final K-NN spec for the trestbps + chol model, with k = 11 as chosen from
# the accuracy plot.
heart_retrain_spec_tc <- nearest_neighbor(weight_func = "rectangular", neighbors = 11) |>
    set_engine("kknn") |>
    set_mode("classification")

# Fit the recipe and model on the full training set.
heart_fit_tc <- workflow() |>
    add_recipe(heart_recipe_tc) |>
    add_model(heart_retrain_spec_tc) |>
    fit(data = heart_train)

# Predict the test set and attach the predictions to the test rows.
heart_predictions_tc <- predict(heart_fit_tc, heart_test) |>
    bind_cols(heart_test)
heart_predictions_tc

# Test-set performance metrics (predicted class vs. true restecg).
heart_metrics_tc <- heart_predictions_tc |>
    metrics(truth = restecg, estimate = .pred_class)
heart_metrics_tc

# Confusion matrix of predicted vs. true restecg on the test set.
heart_conf_mat_tc <- heart_predictions_tc |>
    conf_mat(truth = restecg, estimate = .pred_class)
heart_conf_mat_tc

#### Comparing the accuracy rates of the four models (0.4285714, 0.5584416, 0.48051948, 0.4935064935), we found that the model using age and trestbps to predict restecg has the highest accuracy. We therefore consider this the best model for predicting restecg.

### Discussion:

We found that age and trestbps are the two factors most strongly associated with heart disease.