# Supervised Machine Learning - Test Your Knowledge Key

In [2]:
install.packages(c("tidyverse", "gtsummary", "caret", "e1071", "Hmsic", "randomForest", "pROC", "themis"))

“package ‘Hmsic’ is not available for this version of R

A version of this package for your version of R might be available elsewhere,
see the ideas at
https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages”
also installing the dependencies ‘listenv’, ‘parallelly’, ‘future’, ‘globals’, ‘shape’, ‘future.apply’, ‘progressr’, ‘SQUAREM’, ‘diagram’, ‘KernSmooth’, ‘lava’, ‘V8’, ‘reactR’, ‘rpart’, ‘survival’, ‘nnet’, ‘prodlim’, ‘vroom’, ‘tzdb’, ‘labelled’, ‘bigD’, ‘bitops’, ‘commonmark’, ‘juicyjuice’, ‘markdown’, ‘reactable’, ‘codetools’, ‘iterators’, ‘clock’, ‘ipred’, ‘timeDate’, ‘haven’, ‘readr’, ‘broom.helpers’, ‘gt’, ‘foreach’, ‘ModelMetrics’, ‘plyr’, ‘recipes’, ‘reshape2’, ‘class’, ‘proxy’, ‘Rcpp’, ‘gower’, ‘RANN’, ‘ROSE’, ‘hardhat’


“installation of package ‘tzdb’ had non-zero exit status”
“installation of package ‘vroom’ had non-zero exit status”
“installation of package ‘clock’ had non-zero exit status”
“installation of package ‘readr’ had non-zero exit sta

In [1]:
library(readxl);
library(lubridate);
library(tidyverse);
library(gtsummary);
# # library(caret);
# library(e1071);
# library(Hmisc);
library(randomForest);
# library(pROC);
# library(themis);


Attaching package: ‘lubridate’


The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union




ERROR: Error in library(tidyverse): there is no package called ‘tidyverse’


In [None]:
# Load the data
chromium_data <- data.frame(read_excel("Module5/Module5_Chromium_Data.xlsx"))

# View the top of the dataset
head(chromium_data) 

Like we did in the module, we'll start by changing some of the data types. 

In [None]:
chromium_data = chromium_data %>%
    # Converting `Detect_Concentration from a character to a factor
    mutate(Detect_Concentration = relevel(factor(ifelse(Detect_Concentration == "D", 1, 0)), ref = "0"),
        # converting water sample date from a character to a date type 
        Water_Sample_Date = mdy(Water_Sample_Date))

head(chromium_data)

Testing for differences in predictor variables acrosss the outcome classes.

In [None]:
chromium_data %>%
    tbl_summary(by = Detect_Concentration,
    # Selecting columns to include
    include = colnames(chromium_data[c(2:8)]), 
    # Displaying the mean and standard deviation in parantheses for all continuous variables
                statistic = list(all_continuous() ~ "{mean} ({sd})")) %>%
    # Adding a column that displays the total number of samples for each variable
    # This will be 713 for all variables since we have no missing data
    add_n() %>% 
    # Adding a column that dispalys the p value from anova
    add_p(test = list(all_continuous() ~ "aov")) %>% 
    as_flex_table() %>%
    bold(bold = TRUE, part = "header")

Setting up cross validation and parameters to be tuned.

In [None]:
set.seed(12)

chromium_index = createFolds(chromium_data$Detect_Concentration, k = 5) 

ntree_values = c(50, 250, 500) # number of trees 
p = dim(well_data)[2] - 1 # number of predictor variables in the dataset
mtry_values = c(sqrt(p), p/2, p) # number of predictors to be used in the model

Predicting with RF

In [None]:
# Setting the seed again so the predictions are consistent
set.seed(12)

# Creating an empty dataframe to save the metrics
metrics_df = data.frame()

# Iterating through the cross validation folds
for (i in 1:length(chromium_index)){
    # Training data
    data_train = chromium_data[-chromium_index[[i]],]
    
    # Test data
    data_test = chromium_data[chromium_index[[i]],]
    
    # Creating empty lists and dataframes to store errors 
    reg_rf_pred_tune = list()
    rf_OOB_errors = list()
    rf_error_df = data.frame()
    
    # Tuning parameters: using ntree and mtry values to determine which combination yields the smallest OOB error 
    # from the validation datasets
    for (j in 1:length(ntree_values)){
        for (k in 1:length(mtry_values)){
            
            # Running RF to tune parameters
            reg_rf_pred_tune[[k]] = randomForest(Detect_Concentration ~ ., data = data_train, 
                                                 ntree = ntree_values[j], mtry = mtry_values[k])
            # Obtaining the OOB error
            rf_OOB_errors[[k]] = data.frame("Tree Number" = ntree_values[j], "Variable Number" = mtry_values[k], 
                                   "OOB_errors" = reg_rf_pred_tune[[k]]$err.rate[ntree_values[j],1])
            
            # Storing the values in a dataframe
            rf_error_df = rbind(rf_error_df, rf_OOB_errors[[k]])
        }
    }
    
    # Finding the lowest OOB error using best number of predictors at split
    best_oob_errors <- which(rf_error_df$OOB_errors == min(rf_error_df$OOB_errors))

    # Now running RF on the entire training set with the tuned parameters
    reg_rf <- randomForest(Detect_Concentration ~ ., data = data_train,
                               ntree = rf_error_df$Tree.Number[min(best_oob_errors)],
                               mtry = rf_error_df$Variable.Number[min(best_oob_errors)])

    # Predicting on test set and adding the predicted values as an additional column to the test data
    data_test$Pred_Detect_Concentration = predict(reg_rf, newdata = data_test, type = "response")

    # Obtaining the confusion matrix
    matrix = confusionMatrix(data = data_test$Pred_Detect_Concentration, 
                             reference = data_test$Detect_Concentration, positive = "1")
    
    # Extracting balanced accuracy, sensitivity, specificity, and PPV
    matrix_values = data.frame(t(c(matrix$byClass[11])), t(c(matrix$byClass[1:3])))
    
    # Adding values to df to be averaged across the 5 splits from CV
    metrics_df = rbind(metrics_df, matrix_values)
}

# Taking average
metrics_df = metrics_df %>%
    summarise(`Balanced Accuracy` = mean(Balanced.Accuracy), Sensitivity = mean(Sensitivity), 
          Specificity = mean(Specificity), PPV = mean(Pos.Pred.Value))

# Viewing the model's performance metrics
metrics_df