In [6]:
Output = ('/Users/alexis/Library/CloudStorage/OneDrive-UniversityofNorthCarolinaatChapelHill/CEMALB_DataAnalysisPM/Projects/P1011. NC Well Metals/P1011.2. Analyses/P1011.2.2. Metal Prediction/Output')
cur_date = "040624"

library(readxl)
library(writexl)
library(openxlsx)
library(lubridate)
library(tidyverse)
library(gtsummary)
library(caret)
library(e1071)
library(Hmisc)
library(randomForest)
library(pROC)
library(themis)

# reading in file
well_data = data.frame(read_excel("Input/Imputed_Well_Data_020924.xlsx")) 

In [7]:
head(well_data)

Unnamed: 0_level_0,Tax_ID,Health_Dept_ID,Permit_No,Water_Sample_Date,Casing_Depth,Well_Depth,Static_Water_Depth,Flow_Rate,pH,Metal,Longitude,Latitude,Geologic_Terrane,Rock_Type,Soil_Type_Condensed,Landuse_Condensed,Elevation,Stream_Distance,Concentration,Detect_Concentration
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>
1,4006015,344,09-192,,97,450,35,20.0,7.4,Ar,-80.55427,34.87224,CSB,MV,H,C,675.174,654.6816,2.007062,ND
2,04009002C,1525,14-18,12/7/10,65,300,32,2.0,7.0,Ar,-80.55676,34.87109,CSB,MV,D,C,678.113,454.1912,1.912321,ND
3,04030015C,1525,14-18,8/27/12,65,300,32,2.0,8.1,Ar,-80.55676,34.87109,CSB,MV,D,C,469.708,454.1912,7.0,D
4,04030015J,1525,14-18,4/5/10,65,300,32,2.0,8.1,Ar,-80.55676,34.87109,CSB,MV,H,F,470.293,454.1912,1.153921,ND
5,04030020H,234,09-147,10/25/10,52,125,36,20.0,7.6,Ar,-80.5522,34.86012,CSB,MV,H,F,470.293,918.3419,1.67536,ND
6,4030041,1515,14-04,3/2/16,47,275,34,2.5,8.2,Ar,-80.56423,34.88559,CSB,MV,H,D,470.293,512.3955,14.0,D


## Metal Correlation

In [3]:
# assessing collinearity between the metals
as_df = well_data %>% 
    filter(Metal == "Ar")
mn_df = well_data %>% 
    filter(Metal == "Mn")
cr_df = well_data %>% 
    filter(Metal == "Cr")

cor.test(as_df$Concentration, mn_df$Concentration)
cor.test(as_df$Concentration, cr_df$Concentration)
cor.test(mn_df$Concentration, cr_df$Concentration)


	Pearson's product-moment correlation

data:  as_df$Concentration and mn_df$Concentration
t = -0.058171, df = 713, p-value = 0.9536
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.07548740  0.07115379
sample estimates:
         cor 
-0.002178519 



	Pearson's product-moment correlation

data:  as_df$Concentration and cr_df$Concentration
t = -0.48146, df = 713, p-value = 0.6303
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.09122823  0.05536624
sample estimates:
        cor 
-0.01802788 



	Pearson's product-moment correlation

data:  mn_df$Concentration and cr_df$Concentration
t = -1.3792, df = 713, p-value = 0.1683
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.12443367  0.02182011
sample estimates:
        cor 
-0.05158335 


None of the metals are highly correlated. 

# Prediction of Contaminated Wells using Supervised ML

Using `Water_Sample_Date`, `Casing_Depth`, `Well_Depth`, `Static_Water_Depth`, `pH`, `Flow_Rate`, `Stream_Distance`, `Elevation`, `Geologic_Terrane` or `Rock_Type`, `Soil_Type_Condensed`, `Landuse_Condensed`, `Latitude`, and `Longitude` to predict whether Mn concentration falls above or below the EPA's lifetime Health Advisory Limit (HAL) (< or >= 300 ppb). RF and SVM models will be built to predict concentration this binarized concentration. 

Additionally, models will be run using a different combination of the aforementioned predictors detailed into the following use cases below:

1. All Data (using all 13 features)
2. All variables excluding latitude and longitude (Using this to see if the prediction's accuracy is retained removing those coordinates to make the results more generalizable to areas outside of Union County, NC.)
3. Well Chacteristics only (casing depth, pH, flow rate, well depth, and static water depth)
4. Health Department lens (rock type, latitude, and longitude)

Starting by creating an additional variable for above and below 300 ppb and formatting the df for input into ML models.

In [3]:
manganese_data = well_data %>%
    mutate(MCL = relevel(factor(ifelse(Concentration >= 300, 1, 0)), ref = "0"),
          # converting water sample date from a character to a date type 
          Water_Sample_Date = mdy(Water_Sample_Date)) %>%
    # filtering for manganese only and removing some rows with missing data
    filter(Metal == "Mn" & Landuse_Condensed != "NA") %>%
    select(-c("Metal", "Detect_Concentration"))

# changing data types
manganese_data$Geologic_Terrane = factor(manganese_data$Geologic_Terrane)
manganese_data$Rock_Type = factor(manganese_data$Rock_Type)
manganese_data$Soil_Type_Condensed = factor(manganese_data$Soil_Type_Condensed)
manganese_data$Landuse_Condensed = factor(manganese_data$Landuse_Condensed)

head(manganese_data)

[1m[22m[36mℹ[39m In argument: `Water_Sample_Date = mdy(Water_Sample_Date)`.
[33m![39m  6 failed to parse.”


Unnamed: 0_level_0,Tax_ID,Health_Dept_ID,Permit_No,Water_Sample_Date,Casing_Depth,Well_Depth,Static_Water_Depth,Flow_Rate,pH,Longitude,Latitude,Geologic_Terrane,Rock_Type,Soil_Type_Condensed,Landuse_Condensed,Elevation,Stream_Distance,Concentration,MCL
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<fct>,<dbl>,<dbl>,<dbl>,<fct>
1,04009002C,344,09-192,2010-12-07,97,450,35,20.0,7.4,-80.55427,34.87224,CSB,MV,H,C,678.113,654.6816,21.47573,0
2,04030015C,1525,14-18,2012-08-27,65,300,32,2.0,7.0,-80.55676,34.87109,CSB,MV,D,C,469.708,454.1912,23.35497,0
3,04030015J,1525,14-18,2010-04-05,65,300,32,2.0,8.1,-80.55676,34.87109,CSB,MV,D,C,470.293,454.1912,1300.0,1
4,04030020H,1525,14-18,2010-10-25,65,300,32,2.0,8.1,-80.55676,34.87109,CSB,MV,H,F,470.293,454.1912,850.0,1
5,4030041,234,09-147,2016-03-02,52,125,36,20.0,7.6,-80.5522,34.86012,CSB,MV,H,F,470.293,918.3419,22.76362,0
6,4033001,1515,14-04,2011-12-06,47,275,34,2.5,8.2,-80.56423,34.88559,CSB,MV,H,D,451.806,512.3955,90.0,0


In [4]:
# original number of records
dim(well_data)

# records kept for analysis
dim(manganese_data)

# Summary Statistics
Determining if there are any signficiant differenes between the predictor variables for each outcome variable.

In [8]:
manganese_data %>%
  tbl_summary(by = MCL, missing = "no", 
  include = colnames(manganese_data[c(4:17)]), 
              statistic = list(all_continuous() ~ "{mean} ({sd})",
                               all_categorical() ~ "{n} ({p}%)")) %>%
  add_n() %>% 
  #add_overall() %>%
  add_p(test = list(all_continuous() ~ "t.test",
                    all_categorical() ~ "chisq.test")) %>% # adding p value from anova
  as_tibble() %>%
  write_xlsx(., "Output/Table1b.xlsx")

There was an error in 'add_p()/add_difference()' for variable 'Water_Sample_Date', p-value omitted:
Error in Math.Date(mx): abs not defined for "Date" objects






Although the p values are significant between some of the variables, there is a high level of class imbalance which is likely to affect model performance. Therefore, class imbalance will be addressed using SMOTE. 

In [7]:
# creating dfs for each use case
# dropped 2 rows that had missing dates
manganese_case_1_df = drop_na(manganese_data[,c(4:11,13:17,19)])
manganese_case_2_df = drop_na(manganese_data[,c(4:9,13:17,19)])
manganese_case_3_df = drop_na(manganese_data[,c(5:9,19)])
manganese_case_4_df = drop_na(manganese_data[,c(10,11,13,19)])

head(manganese_case_1_df)

Unnamed: 0_level_0,Water_Sample_Date,Casing_Depth,Well_Depth,Static_Water_Depth,Flow_Rate,pH,Longitude,Latitude,Rock_Type,Soil_Type_Condensed,Landuse_Condensed,Elevation,Stream_Distance,MCL
Unnamed: 0_level_1,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<dbl>,<dbl>,<fct>
1,2010-12-07,97,450,35,20.0,7.4,-80.55427,34.87224,MV,H,C,678.113,654.6816,0
2,2012-08-27,65,300,32,2.0,7.0,-80.55676,34.87109,MV,D,C,469.708,454.1912,0
3,2010-04-05,65,300,32,2.0,8.1,-80.55676,34.87109,MV,D,C,470.293,454.1912,1
4,2010-10-25,65,300,32,2.0,8.1,-80.55676,34.87109,MV,H,F,470.293,454.1912,1
5,2016-03-02,52,125,36,20.0,7.6,-80.5522,34.86012,MV,H,F,470.293,918.3419,0
6,2011-12-06,47,275,34,2.5,8.2,-80.56423,34.88559,MV,H,D,451.806,512.3955,0


# Random Forest
- an ensemble learning method operating by constructing a multitude of decision trees at training time, which uses multiple methods to obtain a better predictive performance and includes bagging and random forest
- algorithm uses a bootstrop aggregation, to reduce overfitting the training datset but only a subset of the features are used hence decorrelation of predictors

In [8]:
rf_classification = function(dataset, outcome, pred_outcome, use_case){
    # setting for reproducibility
    set.seed(12)
    # splitting data into training and testing sets
    dataset_index = createFolds(dataset[[outcome]], k = 5) # 5 fold CV
    
    ntree_values = c(50, 250, 500) # number of trees 
    p = dim(dataset)[2] - 1 # number of variables in dataset
    mtry_values = c(sqrt(p), p/2, p/3) # number of predictors

    metrics = data.frame()
    variable_importance_df = data.frame()
    roc_objects = c()
    threshold_data = data.frame()
    
    for (i in 1:length(dataset_index)){
        
        data_train = dataset[-dataset_index[[i]],]
        # using SMOTE to balance the class distribution
        balanced_data_train = smotenc(data_train, outcome)
        data_test = dataset[dataset_index[[i]],]

        # will use ntree and mtry values to determine which combination yields the smallest MSE
        reg_rf_pred_tune = list()
        rf_OOB_errors = list()
        rf_error_df = data.frame()
        for (j in 1:length(ntree_values)){
            for (k in 1:length(mtry_values)){
                reg_rf_pred_tune[[k]] = randomForest(as.formula(paste0(outcome, "~.")), data = balanced_data_train, 
                                                     ntree = ntree_values[j], mtry = mtry_values[k])
                rf_OOB_errors[[k]] = data.frame("Tree Number" = ntree_values[j], "Variable Number" = mtry_values[k], 
                                       "OOB_errors" = reg_rf_pred_tune[[k]]$err.rate[ntree_values[j],1])
                rf_error_df = rbind(rf_error_df, rf_OOB_errors[[k]])
            }
        }

        # finding the lowest OOB error using best number of predictors at split and refitting OG tree
        best_oob_errors <- which(rf_error_df$OOB_errors == min(rf_error_df$OOB_errors))

        reg_rf <- randomForest(as.formula(paste0(outcome, "~.")), data = balanced_data_train,
                               ntree = rf_error_df$Tree.Number[min(best_oob_errors)],
                               mtry = rf_error_df$Variable.Number[min(best_oob_errors)])

        # predicting on test set
        data_test[[pred_outcome]] = predict(reg_rf, newdata = data_test, type = "response")
        
        matrix = confusionMatrix(data = data_test[[pred_outcome]], reference = data_test[[outcome]], 
                                     positive = "1")

        # calculating AUC
        auc = auc(response = data_test[[outcome]], predictor = factor(data_test[[pred_outcome]], ordered = TRUE))
        
        # calculating values to plot ROC curve later
        roc_obj = roc(response = data_test[[outcome]], predictor = factor(data_test[[pred_outcome]], ordered = TRUE))

        # Return max Youden's index, with specificity and sensitivity
        best_thres_data = data.frame(coords(roc_obj, x = "best", best.method = c("youden", "closest.topleft")))
        threshold_data = rbind(threshold_data, best_thres_data)
        
        # extracting accuracy, lower CI, upper CI, sens, spec, PPV, NPV to take mean later
        matrix_values = data.frame(t(c(matrix$overall[c(1,3,4)])), t(c(matrix$byClass[11])), 
                                   t(c(matrix$byClass[1:4])), auc)
        
        # extracting variable importance
        var_importance_values = data.frame(importance(reg_rf)) %>%
            rownames_to_column(var = "Predictor")
        variable_importance_df = rbind(variable_importance_df, var_importance_values)
   
        # adding values to df
        metrics = rbind(metrics, matrix_values)
        
    }
    
    # taking averages/sd 
    metrics = metrics %>%
        summarise(Accuracy = mean(Accuracy), LowerCI = mean(AccuracyLower), UpperCI = mean(AccuracyUpper),
                  Sensitivity = mean(Sensitivity), Specificity = mean(Specificity), PPV = mean(Pos.Pred.Value), 
                  NPV = mean(Neg.Pred.Value), AUC = mean(auc))
    
    variable_importance_df = variable_importance_df %>%
        group_by(Predictor) %>%
        summarise(MeanDecreaseGini = mean(MeanDecreaseGini)) %>%
        # sorting by most important variables
        arrange(-MeanDecreaseGini) %>%
        mutate(Use_Case = use_case)
  
    # return training set, matrix, variable importance values, (last) roc object, best threshold data
    return(list(balanced_data_train, metrics, variable_importance_df, roc_obj, threshold_data))

}

In [9]:
# calling fn
rf_values_manganese_case_1 = rf_classification(manganese_case_1_df, "MCL", "pred_MCL",1)
rf_values_manganese_case_2 = rf_classification(manganese_case_2_df, "MCL", "pred_MCL",2)
rf_values_manganese_case_3 = rf_classification(manganese_case_3_df, "MCL", "pred_MCL",3)
rf_values_manganese_case_4 = rf_classification(manganese_case_4_df, "MCL", "pred_MCL",4)

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting leve

In [10]:
# viewing results
rf_confusion_matrix = data.frame(Model = "RF Classification", 
                                 Use_Case = 1:4,
                                 Kernel = NA, rbind(rf_values_manganese_case_1[[2]], rf_values_manganese_case_2[[2]],
                                 rf_values_manganese_case_3[[2]],rf_values_manganese_case_4[[2]])) 

rf_confusion_matrix

# viewing most significant features
rf_values_manganese_case_1[[3]]

Model,Use_Case,Kernel,Accuracy,LowerCI,UpperCI,Sensitivity,Specificity,PPV,NPV,AUC
<chr>,<int>,<lgl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
RF Classification,1,,0.8539911,0.7852977,0.9073309,0.4895425,0.9054323,0.4279656,0.9269879,0.6974874
RF Classification,2,,0.8371091,0.7661303,0.8935161,0.3751634,0.9022323,0.3418386,0.9116613,0.6386978
RF Classification,3,,0.8417182,0.7714723,0.8972241,0.351634,0.9105905,0.3668854,0.9091508,0.6311122
RF Classification,4,,0.8262848,0.7540981,0.8844357,0.5313725,0.8673905,0.3579872,0.9302196,0.6993815


Predictor,MeanDecreaseGini,Use_Case
<chr>,<dbl>,<dbl>
Longitude,127.10061,1
Casing_Depth,65.64029,1
Latitude,54.52511,1
Flow_Rate,35.66445,1
pH,35.43432,1
Well_Depth,33.21099,1
Elevation,29.94335,1
Static_Water_Depth,27.33723,1
Water_Sample_Date,26.62692,1
Stream_Distance,24.93286,1


Rerunning RF model with noise variables to determine, which predictors are significant above the background noise.

In [11]:
train_vars_noise_manganese_case_1 = rf_values_manganese_case_1[[1]]

noise_df = function(train_vars_noise){
    set.seed(8)
    # Add random noise predictors as an additional method to evaluate model performance
    # Adding a column that contains randomly shuffled values from one of the molecules; sampling with replacement
    train_vars_noise$noise1 = sample(train_vars_noise[[colnames(train_vars_noise[10])]], replace = TRUE) 
    train_vars_noise$noise2 = sample(train_vars_noise[[colnames(train_vars_noise[2])]], replace = TRUE)
    train_vars_noise$noise3 = sample(train_vars_noise[[colnames(train_vars_noise[5])]], replace = TRUE)
    train_vars_noise$noise4 = sample(train_vars_noise[[colnames(train_vars_noise[6])]], replace = TRUE)
    train_vars_noise$noise5 = sample(train_vars_noise[[colnames(train_vars_noise[9])]], replace = TRUE)
    
    return(train_vars_noise)
}

# calling fn
noise_training_rf_dataset_manganese_case_1 = noise_df(train_vars_noise_manganese_case_1)

head(noise_training_rf_dataset_manganese_case_1)

Unnamed: 0_level_0,Water_Sample_Date,Casing_Depth,Well_Depth,Static_Water_Depth,Flow_Rate,pH,Longitude,Latitude,Rock_Type,Soil_Type_Condensed,Landuse_Condensed,Elevation,Stream_Distance,MCL,noise1,noise2,noise3,noise4,noise5
Unnamed: 0_level_1,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<dbl>,<dbl>,<fct>,<fct>,<dbl>,<dbl>,<dbl>,<fct>
1,2010-12-07,97,450,35,20.0,7.4,-80.55427,34.87224,MV,H,C,678.113,654.6816,0,K,65.0,10.0,7.334356,MV
2,2012-08-27,65,300,32,2.0,7.0,-80.55676,34.87109,MV,D,C,469.708,454.1912,0,H,67.36333,8.0,7.2,G
3,2010-04-05,65,300,32,2.0,8.1,-80.55676,34.87109,MV,D,C,470.293,454.1912,1,K,48.0,5.362296,7.4,M
4,2010-10-25,65,300,32,2.0,8.1,-80.55676,34.87109,MV,H,F,470.293,454.1912,1,K,42.07034,10.0,7.6,MV
5,2016-03-02,52,125,36,20.0,7.6,-80.5522,34.86012,MV,H,F,470.293,918.3419,0,H,40.0,30.0,8.0,M
6,2011-12-06,47,275,34,2.5,8.2,-80.56423,34.88559,MV,H,D,451.806,512.3955,0,H,58.0,8.0,7.694528,M


In [12]:
# calling fn
noise_rf_values_manganese_case_1 = rf_classification(noise_training_rf_dataset_manganese_case_1, "MCL", 
                                             "pred_MCL",1)

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases



In [13]:
# viewing results
noise_rf_confusion_matrix = data.frame(Model = "RF w/ Noise",
            noise_rf_values_manganese_case_1[[2]])

noise_rf_confusion_matrix

Model,Accuracy,LowerCI,UpperCI,Sensitivity,Specificity,PPV,NPV,AUC
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
RF w/ Noise,0.9118081,0.8640512,0.9467153,0.9359192,0.887697,0.8954849,0.9334937,0.9118081


Calculating the average noise rank to determine, which predictors ranked higher than the highest noise rank.

In [14]:
noise_importance_values_case_1 = noise_rf_values_manganese_case_1[[3]] %>%
    column_to_rownames(var = "Predictor")

In [15]:
highest_noise_rank = function(noise_importance_values){
    # this gives us values between 0 and 1
    # the closer the number gets to 1 the lower it ranks, so the number closest to 1 is the max rank
    noise1rank = which(rownames(noise_importance_values) == "noise1")/nrow(noise_importance_values)
    noise2rank = which(rownames(noise_importance_values) == "noise2")/nrow(noise_importance_values)
    noise3rank = which(rownames(noise_importance_values) == "noise3")/nrow(noise_importance_values)
    noise4rank = which(rownames(noise_importance_values) == "noise4")/nrow(noise_importance_values)
    noise5rank = which(rownames(noise_importance_values) == "noise5")/nrow(noise_importance_values)

    # figuring out which is the highest mean decrease gini or background noise importance value
    highest_noise_rank = min(noise1rank, noise2rank, noise3rank, noise4rank, noise5rank)
    
    return(highest_noise_rank)
}

# calling fn
highest_noise_rank_case_1 = highest_noise_rank(noise_importance_values_case_1)
highest_noise_rank_case_1

In [16]:
last_significant_rank = function(noise_importance_values, highest_noise_rank){
    for (i in 1:length(noise_importance_values$MeanDecreaseGini)){
    # if the row number (index)/# of variables is less than the highest noise rank the loop will continue
        if (i/length(noise_importance_values$MeanDecreaseGini) > highest_noise_rank){
            last_significant_var = i - 1
            break
        }
    }    
    return(last_significant_var)
}

# calling fn
last_significant_rank_case_1 = last_significant_rank(noise_importance_values_case_1, highest_noise_rank_case_1)
last_significant_rank_case_1

In [17]:
# now getting those significant predictors
significant_predictors_case_1 = rownames(noise_importance_values_case_1)[1:last_significant_rank_case_1]

In [18]:
# converting row names back to columns to make next step easier
noise_importance_values_case_1 = noise_importance_values_case_1 %>%
    rownames_to_column(var = "Predictor")

In [19]:
# getting dataframe ready to export variable importance ranks from above and whether those features passed the 
# background filter
noise_variable_importance_ranks = function(noise_importance_values_df, significant_predictors, use_case){
    # Determing what variables fall above the random noise variable importance ranks
    # :parameters: a dataframe containing the initial feature importance values, last position considered to be significant,
    # use case variables
    # :output: a dataframe containing the feature, mean decrease gini, significance, outcome predicted, variables included 
    # in the model
    
    # filtering the original rank df for predictors that were significant (above random noise)
    filtered_significant_ranks_df = noise_importance_values_df %>%
        filter(Predictor %in% significant_predictors) %>%
        arrange(-MeanDecreaseGini)
    
    # adding a column denoting if the feature was above random noise
    filtered_significant_ranks_df$Passed_Filter = c(rep(c("Yes"), 
                                                        times = length(filtered_significant_ranks_df$Predictor)))
    
    final_df = full_join(filtered_significant_ranks_df, noise_importance_values_df)
    
    # adding a col for outcome and the predictors used
    final_df$Use_Case = use_case
    
    return(final_df)
}

# calling fn
significant_predictors_df_case_1 = noise_variable_importance_ranks(noise_importance_values_case_1, 
                                                                   significant_predictors_case_1, 1)

# creating 1 df
significant_predictors_df_3_cases = rbind(rf_values_manganese_case_2[[3]], rf_values_manganese_case_3[[3]],
                                 rf_values_manganese_case_4[[3]]) %>%
# Adding a column to match the first use case df
mutate(Passed_Filter = NA)

significant_predictors_df = rbind(significant_predictors_df_case_1, significant_predictors_df_3_cases)

head(significant_predictors_df)

[1m[22mJoining with `by = join_by(Predictor, MeanDecreaseGini, Use_Case)`


Unnamed: 0_level_0,Predictor,MeanDecreaseGini,Use_Case,Passed_Filter
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>
1,Longitude,89.84516,1,Yes
2,Casing_Depth,45.55302,1,Yes
3,Latitude,38.5652,1,Yes
4,pH,26.24008,1,Yes
5,Elevation,24.45495,1,Yes
6,Water_Sample_Date,23.97867,1,Yes


# Support Vector Machine (SVM)
- supervised learning models that can predict continuous (regression) or grouped/dichotomous (classification) outcomes
- predicts by projecting them onto a high dimensional space and uses kernels to make the data more separable (unfortunately makes interpretability of model results more difficult)
- does a better job at handling a large number of predictors since p > n
- Compared to other classification algorithms, this approach can reliably classify chemicals while avoiding overfitting and reducing susceptibility to noisy or meaningless data

In [20]:
#model errors are calculated to assess the predictive accuracy of the model
svm_classification = function(dataset, model, outcome, pred_outcome, gamma_values, cost_values, elsilon_values, use_case){
  
    #setting seed for reproducibility
    set.seed(12)
    
    #splitting data into training and testing sets
    dataset_index = createFolds(dataset[[outcome]], k = 5) #5 fold CV
    metrics = data.frame()
    for (i in 1:length(dataset_index)){
        
        data_train = dataset[-dataset_index[[i]],]
        # using SMOTE to balance the class distribution
        balanced_data_train = smotenc(data_train, outcome)
        data_test = dataset[dataset_index[[i]],]

        #now pruning parameters (based on the training dataset to prevent overfitting)
        svr_tune <- tune(svm, as.formula(paste0(outcome, "~.")), data = balanced_data_train, kernel = model, 
                       ranges = list(elsilon = elsilon_values, cost = cost_values, gamma = gamma_values)) 

        #choosing best model
        best_svm <- svr_tune$best.model

        #predicting with tuned parameters 
        data_test[[pred_outcome]] <- predict(best_svm, newdata = data_test, type = "response")

        matrix = confusionMatrix(data = data_test[[pred_outcome]], reference = data_test[[outcome]], 
                                 positive = "1")
 
        #calculating AUC
        auc = auc(response = data_test[[outcome]], predictor = factor(data_test[[pred_outcome]], ordered = TRUE))
        # extracting accuracy, lower CI, upper CI, sens, spec, PPV to take mean later
        matrix_values = data.frame(t(c(matrix$overall[c(1,3,4)])), t(c(matrix$byClass[11])), 
                                   t(c(matrix$byClass[1:4])), auc)
   
        #adding values to df
        metrics = rbind(metrics, matrix_values)
  }
  
    # taking averages/sd 
    metrics = metrics %>%
        summarise(Accuracy = mean(Accuracy), LowerCI = mean(AccuracyLower), UpperCI = mean(AccuracyUpper),
                  Sensitivity = mean(Sensitivity), Specificity = mean(Specificity), PPV = mean(Pos.Pred.Value), 
                  NPV = mean(Neg.Pred.Value), AUC = mean(auc)) %>%
    mutate(Model = "SVM", Use_Case = use_case, Kernel = model) 

    # reordering columns
    metrics = metrics[,c(9:11,1:8)]
  
    return(metrics)
}

In [21]:
#calling fn
svm_case_1_linear = svm_classification(manganese_case_1_df, "linear", "MCL", "pred_MCL",
                                            0.035, 1:5, seq(0,1,0.2),1)
svm_case_1_radial = svm_classification(manganese_case_1_df, "radial", "MCL", "pred_MCL",
                                            0.035, 1:5, seq(0,1,0.2),1)
svm_case_1_polynomial = svm_classification(manganese_case_1_df, "polynomial", "MCL", "pred_MCL",
                                            0.035, 1:5, seq(0,1,0.2),1)
svm_case_2_linear = svm_classification(manganese_case_2_df, "linear", "MCL", "pred_MCL",
                                            0.035, 1:5, seq(0,1,0.2),2)
svm_case_2_radial = svm_classification(manganese_case_2_df, "radial", "MCL", "pred_MCL",
                                            0.035, 1:5, seq(0,1,0.2),2)
svm_case_2_polynomial = svm_classification(manganese_case_2_df, "polynomial", "MCL", "pred_MCL",
                                            0.035, 1:5, seq(0,1,0.2),2)
svm_case_3_linear = svm_classification(manganese_case_3_df, "linear", "MCL", "pred_MCL",
                                            0.035, 1:5, seq(0,1,0.2),3)
svm_case_3_radial = svm_classification(manganese_case_3_df, "radial", "MCL", "pred_MCL",
                                            0.035, 1:5, seq(0,1,0.2),3)
svm_case_3_polynomial = svm_classification(manganese_case_3_df, "polynomial", "MCL", "pred_MCL",
                                            0.035, 1:5, seq(0,1,0.2),3)
svm_case_4_linear = svm_classification(manganese_case_4_df, "linear", "MCL", "pred_MCL",
                                            0.035, 1:5, seq(0,1,0.2),4)
svm_case_4_radial = svm_classification(manganese_case_4_df, "radial", "MCL", "pred_MCL",
                                            0.035, 1:5, seq(0,1,0.2),4)
svm_case_4_polynomial = svm_classification(manganese_case_4_df, "polynomial", "MCL", "pred_MCL",
                                            0.035, 1:5, seq(0,1,0.2),4)

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting leve

In [22]:
# creating final df
final_df = rbind(rf_confusion_matrix, svm_case_1_linear, svm_case_1_radial, svm_case_1_polynomial, 
                 svm_case_2_linear, svm_case_2_radial, svm_case_2_polynomial, svm_case_3_linear, 
                 svm_case_3_radial, svm_case_3_polynomial, svm_case_4_linear, svm_case_4_radial, svm_case_4_polynomial) 

final_df

Model,Use_Case,Kernel,Accuracy,LowerCI,UpperCI,Sensitivity,Specificity,PPV,NPV,AUC
<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
RF Classification,1,,0.8539911,0.7852977,0.9073309,0.4895425,0.9054323,0.4279656,0.9269879,0.6974874
RF Classification,2,,0.8371091,0.7661303,0.8935161,0.3751634,0.9022323,0.3418386,0.9116613,0.6386978
RF Classification,3,,0.8417182,0.7714723,0.8972241,0.351634,0.9105905,0.3668854,0.9091508,0.6311122
RF Classification,4,,0.8262848,0.7540981,0.8844357,0.5313725,0.8673905,0.3579872,0.9302196,0.6993815
SVM,1,linear,0.7541657,0.6751397,0.8223125,0.6718954,0.7659355,0.2864762,0.9440234,0.7189155
SVM,1,radial,0.8047727,0.7302514,0.8661769,0.5235294,0.8445032,0.3263629,0.926442,0.6840163
SVM,1,polynomial,0.7696592,0.6920668,0.8356845,0.6941176,0.7803742,0.3126476,0.9486809,0.7372459
SVM,2,linear,0.6839094,0.6008189,0.7591642,0.7058824,0.6810323,0.2356219,0.943593,0.6934573
SVM,2,radial,0.7570226,0.6781845,0.8248552,0.5235294,0.7900516,0.2580031,0.9218341,0.6567905
SVM,2,polynomial,0.6151895,0.5302537,0.6952245,0.7633987,0.5944903,0.2095143,0.9469985,0.6789445


In [23]:
# # exporting
write.xlsx(final_df, paste0(Output,"/", "Mn_Prediction_Confusion_Matrix_", cur_date, ".xlsx"), 
           rowNames = FALSE)
write.xlsx(significant_predictors_df, paste0(Output,"/", "Mn_Variable_Importance_", cur_date, ".xlsx"), rowNames = FALSE)