# Random forest model

## Assessing the predictive ability of a random forest model
Using a training and testing split of the DCD and DBD datasets to fit and assess the predictive ability of a random forest model

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as mets
import time
%matplotlib inline

In [15]:
#Read in dataset with all rows included
df = pd.read_sas("Data/alldata3.sas7bdat")

#6931 DBD apps
dbd_apps = df[(df["eli_DBD"]==1)&(df["FAMILY_APPROACHED"]==2)]

#6060 DBD apps to match cohort in paper
dbd_apps = dbd_apps[(dbd_apps["eth_grp"]!=5)&(dbd_apps["FORMAL_APR_WHEN"]!=4)&(dbd_apps["donation_mentioned"]!=-1)
                    &(dbd_apps["FAMILY_WITNESS_BSDT"]!=9)&(dbd_apps["GENDER"]!=9)&(~dbd_apps["wish"].isin([2,3,4]))]
     
#9965 DCD apps
dcd_apps = df[(df["eli_DCD"]==1)&(df["FAMILY_APPROACHED"]==2)]

#9405 DCD apps to match cohort in paper
dcd_apps = dcd_apps[(dcd_apps["GENDER"]!=9)&(dcd_apps["cod_neuro"].notna())&(dcd_apps["eth_grp"]!=5)&(dcd_apps["donation_mentioned"]!=-1)
                    &(~dcd_apps["DTC_WD_TRTMENT_PRESENT"].isin([8,9]))&(~dcd_apps["wish"].isin([2,3,4]))]

# Columns used to create DBD model in paper
dbd_cols = ["wish", "FORMAL_APR_WHEN", "donation_mentioned", "app_nature", "eth_grp", "religion_grp", "GENDER", "FAMILY_WITNESS_BSDT", "DTC_PRESENT_BSD_CONV", 
            "acorn_new", "adult","FAMILY_CONSENT"]

dbd_apps[dbd_cols].astype(int)

dbd_model_data = dbd_apps[dbd_cols]
dbd_model_data2 = pd.get_dummies(data=dbd_model_data,columns=dbd_cols[:-1],drop_first=True)

dbd_features = dbd_model_data2.drop("FAMILY_CONSENT",axis=1)
dbd_consents = dbd_model_data2["FAMILY_CONSENT"]

# Columns used to create DCD model in paper
dcd_cols = ["wish", "donation_mentioned", 
            "app_nature", "eth_grp", "religion_grp", "GENDER", "DTC_WD_TRTMENT_PRESENT", 
            "acorn_new", "adult","cod_neuro","FAMILY_CONSENT"]

dcd_apps[dbd_cols].astype(int)

dcd_model_data = dcd_apps[dcd_cols]
dcd_model_data2 = pd.get_dummies(data=dcd_model_data,columns=dcd_cols[:-1],drop_first=True)

dcd_features = dcd_model_data2.drop("FAMILY_CONSENT",axis=1)
dcd_consents = dcd_model_data2["FAMILY_CONSENT"]

# creating a train and testing dataset for DBD and DCD approaches

# 1382 consents, 618 non-consents in test data
DBD_X_train, DBD_X_test, DBD_y_train, DBD_y_test = train_test_split(dbd_features,dbd_consents, test_size=0.33, random_state=10)

# 1865 consents, 1239 non-consents in test data
DCD_X_train, DCD_X_test, DCD_y_train, DCD_y_test = train_test_split(dcd_features,dcd_consents, test_size=0.33, random_state=10)

  rslt[name] = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d")
  rslt[name] = self._string_chunk[js, :]


In [16]:
# fitting tree to training data 
tree_model = RandomForestClassifier(n_estimators=10)

In [17]:
DBD_tree = tree_model.fit(DBD_X_train,DBD_y_train)

DBD_preds = DBD_tree.predict(DBD_X_test)

cm = mets.confusion_matrix(DBD_y_test, DBD_preds)

print("TN  FN\nFP  TP\n")
print(str(int(cm[0,0])) + "    " + str(int(cm[0,1])))
print(str(int(cm[1,0])) + "    " + str(int(cm[1,1])) + "\n") 

# classification report for DBD model
print(mets.classification_report(DBD_y_test, DBD_preds))

print("Balanced accuracy: " + str(round(mets.balanced_accuracy_score(DBD_y_test, DBD_preds),2)))

print("Accuracy: " + str(round(mets.accuracy_score(DBD_y_test, DBD_preds),2)))

TN  FN
FP  TP

313    233
256    435

              precision    recall  f1-score   support

         1.0       0.55      0.57      0.56       546
         2.0       0.65      0.63      0.64       691

    accuracy                           0.60      1237
   macro avg       0.60      0.60      0.60      1237
weighted avg       0.61      0.60      0.61      1237

Balanced accuracy: 0.6
Accuracy: 0.6


In [5]:
# Plot tree
# fig,ax = plt.subplots(1,1,figsize=[100,100])
# plot_tree(DBD_tree,feature_names=dbd_features.columns,class_names=["Non-consent","Consent"])
# plt.show()

In [7]:
DCD_tree = tree_model.fit(DCD_X_train,DCD_y_train)

DCD_preds = DCD_tree.predict(DCD_X_test)

dcd_cm = mets.confusion_matrix(DCD_y_test, DCD_preds)

print("TN  FN\nFP  TP\n")
print(str(int(dcd_cm[0,0])) + "    " + str(int(dcd_cm[0,1])))
print(str(int(dcd_cm[1,0])) + "    " + str(int(dcd_cm[1,1])) + "\n") 

# classification report for DCD model
print(mets.classification_report(DCD_y_test, DCD_preds))

print("Cohen kappa: " + str(round(mets.cohen_kappa_score(DCD_y_test, DCD_preds),2)))

print("Balanced accuracy: " + str(round(mets.balanced_accuracy_score(DCD_y_test, DCD_preds),2)))

print("Accuracy: " + str(round(mets.accuracy_score(DCD_y_test, DCD_preds),2)))

TN  FN
FP  TP

762    477
468    1397

              precision    recall  f1-score   support

         1.0       0.62      0.62      0.62      1239
         2.0       0.75      0.75      0.75      1865

    accuracy                           0.70      3104
   macro avg       0.68      0.68      0.68      3104
weighted avg       0.70      0.70      0.70      3104

Cohen kappa: 0.36
Balanced accuracy: 0.68
Accuracy: 0.7


## Hyperparameter tuning

In [18]:
cv_forest_model = RandomForestClassifier(random_state=66)

weights = []
for w in np.arange(2,4,step=0.25):
    w_dic = {1.0:w,2.0:1}
    weights.append(w_dic)

params = {'max_depth':np.arange(1,200,step=25),'min_samples_split':np.arange(2,200,step=25),'class_weight':weights,'n_estimators':np.arange(10,100,step=10)}

start_time = time.time()

dbd_gs_forest_model = GridSearchCV(cv_forest_model, param_grid=params, scoring="balanced_accuracy",cv=5,n_jobs=3)

dbd_gs_forest_model.fit(DBD_X_train,DBD_y_train)

runtime = time.time() - start_time
print("Runtime = {}minutes".format(round(runtime/60,1)))

dbd_gs_forest_model.score(DBD_X_train,DBD_y_train)

print(dbd_gs_forest_model.best_params_)
print(dbd_gs_forest_model.best_score_)

Runtime = 9.0minutes
{'class_weight': {1.0: 2.0, 2.0: 1}, 'max_depth': 26, 'min_samples_split': 27, 'n_estimators': 10}
0.6149315941779839


In [19]:
DBD_preds = dbd_gs_forest_model.predict(DBD_X_test)

cm = mets.confusion_matrix(DBD_y_test, DBD_preds)

print("TN  FN\nFP  TP\n")
print(str(int(cm[0,0])) + "    " + str(int(cm[0,1])))
print(str(int(cm[1,0])) + "    " + str(int(cm[1,1])) + "\n") 

# classification report for DBD model
print(mets.classification_report(DBD_y_test, DBD_preds))

print("Balanced accuracy: " + str(round(mets.balanced_accuracy_score(DBD_y_test, DBD_preds),2)))

print("Accuracy: " + str(round(mets.accuracy_score(DBD_y_test, DBD_preds),2)))

TN  FN
FP  TP

429    117
386    305

              precision    recall  f1-score   support

         1.0       0.53      0.79      0.63       546
         2.0       0.72      0.44      0.55       691

    accuracy                           0.59      1237
   macro avg       0.62      0.61      0.59      1237
weighted avg       0.64      0.59      0.58      1237

Balanced accuracy: 0.61
Accuracy: 0.59


In [10]:
DBD_preds2 = dbd_gs_forest_model.predict(DBD_X_train)

cm = mets.confusion_matrix(DBD_y_train, DBD_preds2)

print("TN  FN\nFP  TP\n")
print(str(int(cm[0,0])) + "    " + str(int(cm[0,1])))
print(str(int(cm[1,0])) + "    " + str(int(cm[1,1])) + "\n") 

# classification report for DBD model
print(mets.classification_report(DBD_y_train, DBD_preds2))

print("Balanced accuracy: " + str(round(mets.balanced_accuracy_score(DBD_y_train, DBD_preds2),2)))

TN  FN
FP  TP

1032    150
1026    1852

              precision    recall  f1-score   support

         1.0       0.50      0.87      0.64      1182
         2.0       0.93      0.64      0.76      2878

    accuracy                           0.71      4060
   macro avg       0.71      0.76      0.70      4060
weighted avg       0.80      0.71      0.72      4060

Balanced accuracy: 0.76


In [6]:
cv_forest_model = RandomForestClassifier(random_state=66)

weights = []
for w in np.arange(2,4,step=0.25):
    w_dic = {1.0:w,2.0:1}
    weights.append(w_dic)

params = {'max_depth':np.arange(1,200,step=25),'min_samples_split':np.arange(2,200,step=25),'class_weight':weights,'n_estimators':np.arange(10,100,step=10)}

start_time = time.time()

dcd_gs_forest_model = GridSearchCV(cv_forest_model, param_grid=params, scoring="balanced_accuracy",cv=5,n_jobs=3)

dcd_gs_forest_model.fit(DCD_X_train,DCD_y_train)

runtime = time.time() - start_time
print("Runtime = {}minutes".format(round(runtime/60,1)))

dcd_gs_forest_model.score(DCD_X_train,DCD_y_train)

print(dcd_gs_forest_model.best_params_)
print(dcd_gs_forest_model.best_score_)

Runtime = 14.8minutes
{'class_weight': {1.0: 2.0, 2.0: 1}, 'max_depth': 26, 'min_samples_split': 77, 'n_estimators': 50}
0.7325624705327677


In [5]:
DCD_preds = dcd_gs_forest_model.predict(DCD_X_test)

cm = mets.confusion_matrix(DCD_y_test, DCD_preds)

print("TN  FN\nFP  TP\n")
print(str(int(cm[0,0])) + "    " + str(int(cm[0,1])))
print(str(int(cm[1,0])) + "    " + str(int(cm[1,1])) + "\n") 

# classification report for DBD model
print(mets.classification_report(DCD_y_test, DCD_preds))

print("Balanced accuracy: " + str(round(mets.balanced_accuracy_score(DCD_y_test, DCD_preds),2)))

print("Accuracy: " + str(round(mets.accuracy_score(DCD_y_test, DCD_preds),2)))

TN  FN
FP  TP

766    473
380    1485

              precision    recall  f1-score   support

         1.0       0.67      0.62      0.64      1239
         2.0       0.76      0.80      0.78      1865

    accuracy                           0.73      3104
   macro avg       0.71      0.71      0.71      3104
weighted avg       0.72      0.73      0.72      3104

Balanced accuracy: 0.71
Accuracy: 0.73
