# Random forest model

## Assessing the predictive ability of a random forest model
This notebook uses a training/testing split of the DCD and DBD datasets to fit a random forest model and assess its predictive ability.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as mets
import time
%matplotlib inline

# Function to print confusion matrix, balanced accuracy and accuracy for a set of actual and predicted labels
def show_metrics(actual,predict):
    """ Prints the confusion matrix, classification report, balanced accuracy, accuracy and
    consent rates given datasets of actual and predicted labels

    The labels are expected to be the strings "Consent" and "Non-consent"; "Consent" is
    treated as the positive class in the printed confusion matrix legend.

    Arguments:
        actual - Dataset of actual labels (list, NumPy array or pandas Series)
        predict - Dataset of predicted labels, same length as `actual`

    Returns:
        None - the report is only printed
     """
    # Convert once to NumPy arrays so the element-wise comparisons used for the
    # consent rates also work when plain Python lists are passed in.
    actual = np.asarray(actual)
    predict = np.asarray(predict)

    # Pin the label order explicitly: row/column 0 is "Consent", row/column 1 is
    # "Non-consent". This guarantees the layout matches the TP/FN/FP/TN legend below
    # and keeps the matrix 2x2 even if one class is absent from the data.
    cm = mets.confusion_matrix(actual, predict, labels=["Consent", "Non-consent"])

    print("********* MODEL METRIC REPORT *********\n\nConfusion matrix:\n")

    print("TP  FN\nFP  TN\n") #this is a reminder of what each part of the confusion matrix means e.g. TP = True Positive

    # print the confusion matrix (rows = actual label, columns = predicted label)
    print(str(int(cm[0,0])) + "    " + str(int(cm[0,1])))
    print(str(int(cm[1,0])) + "    " + str(int(cm[1,1])) + "\n")

    # per-class precision / recall / f1 for whichever model produced `predict`
    print("Classification report:\n")
    print(mets.classification_report(actual, predict))

    print("Balanced accuracy: " + str(round(mets.balanced_accuracy_score(actual, predict),2)))

    print("Accuracy: " + str(round(mets.accuracy_score(actual, predict),2)))

    # Predicted vs actual consent rates, as whole-number percentages (truncated, not rounded)
    cons_rate = int(100 * np.count_nonzero(actual == "Consent") / len(actual))
    print("\nActual consent rate: " + str(cons_rate))

    pred_rate = int(100 * np.count_nonzero(predict == "Consent") / len(predict))
    print("Predicted consent rate: " + str(pred_rate))
 
# Function to format consent column from integer code to text
def format_consent(x):
    """ Translates an integer consent code into its text label

    Arguments:
        x - Consent code; 2 means "Consent" and 1 means "Non-consent"

    Returns:
        The matching label, or None when the code is neither 2 nor 1
    """
    # Checked in the same order as the codes are listed: 2 first, then 1
    for code, label in ((2, "Consent"), (1, "Non-consent")):
        if x == code:
            return label
    return None

In [2]:
# Load the DBD and DCD modelling datasets
dbd_model_data = pd.read_csv("Data/dbd_model_data.csv")
dcd_model_data = pd.read_csv("Data/dcd_model_data.csv")

# Columns used to create the DBD model; the last entry is the target and is
# therefore excluded from one-hot encoding via the [:-1] slice below
dbd_cols = [
    "wish", "FORMAL_APR_WHEN", "donation_mentioned", "app_nature", "eth_grp",
    "religion_grp", "GENDER", "FAMILY_WITNESS_BSDT", "DTC_PRESENT_BSD_CONV",
    "acorn_new", "adult", "FAMILY_CONSENT",
]

# Columns used to create the DCD model in the paper (target last, as above)
dcd_cols = [
    "wish", "donation_mentioned",
    "app_nature", "eth_grp", "religion_grp", "GENDER", "DTC_WD_TRTMENT_PRESENT",
    "acorn_new", "adult", "cod_neuro", "FAMILY_CONSENT",
]

# One-hot encode the predictors, dropping the first level of each to avoid redundant columns.
# NOTE(review): any CSV column not listed above is passed through unchanged and ends up
# in the feature matrix - confirm the CSVs contain only the listed columns.
dbd_model_data2 = pd.get_dummies(dbd_model_data, columns=dbd_cols[:-1], drop_first=True)
dcd_model_data2 = pd.get_dummies(dcd_model_data, columns=dcd_cols[:-1], drop_first=True)

# Separate predictors from the target, converting the integer consent codes to text labels
dbd_features = dbd_model_data2.drop(columns="FAMILY_CONSENT")
dbd_consents = dbd_model_data2["FAMILY_CONSENT"].apply(format_consent)

dcd_features = dcd_model_data2.drop(columns="FAMILY_CONSENT")
dcd_consents = dcd_model_data2["FAMILY_CONSENT"].apply(format_consent)

# Hold out 33% of each dataset for testing; the fixed seed makes the split reproducible
DBD_X_train, DBD_X_test, DBD_y_train, DBD_y_test = train_test_split(
    dbd_features, dbd_consents, test_size=0.33, random_state=10
)

DCD_X_train, DCD_X_test, DCD_y_train, DCD_y_test = train_test_split(
    dcd_features, dcd_consents, test_size=0.33, random_state=10
)

## Hyperparameter tuning

In [6]:
# Base estimator for the grid search. GridSearchCV fits clones of it, so this
# object itself is never fitted and can safely be shared between searches.
cv_forest_model = RandomForestClassifier(random_state=66)

# Candidate class weights: "Non-consent" is up-weighted by 2.0, 2.25, ..., 3.75
# relative to "Consent" to counter the class imbalance.
weights = [{"Non-consent": w, "Consent": 1} for w in np.arange(2, 4, step=0.25)]

# Exhaustive grid: 8 depths x 8 split sizes x 8 class weights x 9 forest sizes
params = {
    'max_depth': np.arange(1, 200, step=25),
    'min_samples_split': np.arange(2, 200, step=25),
    'class_weight': weights,
    'n_estimators': np.arange(10, 100, step=10),
}

# perf_counter is monotonic, so the elapsed time cannot be skewed by system clock changes
start_time = time.perf_counter()

# 5-fold cross-validated search, selecting on balanced accuracy
dbd_gs_forest_model = GridSearchCV(cv_forest_model, param_grid=params, scoring="balanced_accuracy", cv=5, n_jobs=3)

dbd_gs_forest_model.fit(DBD_X_train, DBD_y_train)

runtime = time.perf_counter() - start_time
print("Runtime = {}minutes".format(round(runtime/60,1)))

# A bare `dbd_gs_forest_model.score(DBD_X_train, DBD_y_train)` call used to sit here.
# Its result was neither stored nor displayed (only a cell's last expression is shown),
# so it has been removed. To inspect the training-set score, evaluate that expression
# as the last line of its own cell.

# Best hyperparameter combination and its mean cross-validated balanced accuracy
print(dbd_gs_forest_model.best_params_)
print(dbd_gs_forest_model.best_score_)

Runtime = 11.5minutes
{'class_weight': {'Non-consent': 2.75, 'Consent': 1}, 'max_depth': 26, 'min_samples_split': 177, 'n_estimators': 50}
0.7497479664748189


In [7]:
# Predict the held-out DBD test set (GridSearchCV delegates to the estimator
# refitted with the best parameters) and print the metric report.
DBD_preds = dbd_gs_forest_model.predict(DBD_X_test)

show_metrics(actual=DBD_y_test, predict=DBD_preds)

********* MODEL METRIC REPORT *********

Confusion matrix:

TP  FN
FP  TN

858    524
77    541

Classification report:

              precision    recall  f1-score   support

     Consent       0.92      0.62      0.74      1382
 Non-consent       0.51      0.88      0.64       618

    accuracy                           0.70      2000
   macro avg       0.71      0.75      0.69      2000
weighted avg       0.79      0.70      0.71      2000

Balanced accuracy: 0.75
Accuracy: 0.7

Actual consent rate: 69
Predicted consent rate: 46


In [9]:
# Search grid for the DCD model (same grid as used for the DBD model); relies on
# `weights` and `cv_forest_model` defined in the DBD hyperparameter-tuning cell.
params = {
    'max_depth': np.arange(1, 200, step=25),
    'min_samples_split': np.arange(2, 200, step=25),
    'class_weight': weights,
    'n_estimators': np.arange(10, 100, step=10),
}

# perf_counter is monotonic, so the elapsed time cannot be skewed by system clock changes
start_time = time.perf_counter()

# 5-fold cross-validated search, selecting on balanced accuracy
dcd_gs_forest_model = GridSearchCV(cv_forest_model, param_grid=params, scoring="balanced_accuracy", cv=5, n_jobs=3)

dcd_gs_forest_model.fit(DCD_X_train, DCD_y_train)

runtime = time.perf_counter() - start_time
print("Runtime = {}minutes".format(round(runtime/60,1)))

# A bare `dcd_gs_forest_model.score(DCD_X_train, DCD_y_train)` call used to sit here.
# Its result was neither stored nor displayed (only a cell's last expression is shown),
# so it has been removed. To inspect the training-set score, evaluate that expression
# as the last line of its own cell.

# Best hyperparameter combination and its mean cross-validated balanced accuracy
print(dcd_gs_forest_model.best_params_)
print(dcd_gs_forest_model.best_score_)

Runtime = 15.1minutes
{'class_weight': {'Non-consent': 2.0, 'Consent': 1}, 'max_depth': 26, 'min_samples_split': 77, 'n_estimators': 50}
0.7325624705327677


In [10]:
# Predict the held-out DCD test set (GridSearchCV delegates to the estimator
# refitted with the best parameters) and print the metric report.
DCD_preds = dcd_gs_forest_model.predict(DCD_X_test)

show_metrics(actual=DCD_y_test, predict=DCD_preds)

********* MODEL METRIC REPORT *********

Confusion matrix:

TP  FN
FP  TN

1029    836
142    1097

Classification report:

              precision    recall  f1-score   support

     Consent       0.88      0.55      0.68      1865
 Non-consent       0.57      0.89      0.69      1239

    accuracy                           0.68      3104
   macro avg       0.72      0.72      0.68      3104
weighted avg       0.75      0.68      0.68      3104

Balanced accuracy: 0.72
Accuracy: 0.68

Actual consent rate: 60
Predicted consent rate: 37
