### Importing required libraries


In [13]:
import logging
import numpy as np
import pandas as pd
from dask.distributed import Client
from dask import delayed, compute
from helpers import get_train_test, RANDOM_STATE, RFB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix
)

In [14]:
# Start a Dask distributed client: 4 workers with 2 threads each,
# with log messages below ERROR level silenced.
client = Client(
    n_workers=4,
    threads_per_worker=2,
    silence_logs=logging.ERROR,
)
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 52666 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:52666/status,

0,1
Dashboard: http://127.0.0.1:52666/status,Workers: 4
Total threads: 8,Total memory: 7.80 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:52667,Workers: 4
Dashboard: http://127.0.0.1:52666/status,Total threads: 8
Started: Just now,Total memory: 7.80 GiB

0,1
Comm: tcp://127.0.0.1:52701,Total threads: 2
Dashboard: http://127.0.0.1:52703/status,Memory: 1.95 GiB
Nanny: tcp://127.0.0.1:52672,
Local directory: C:\Users\shula\AppData\Local\Temp\dask-worker-space\worker-6ctq6sj1,Local directory: C:\Users\shula\AppData\Local\Temp\dask-worker-space\worker-6ctq6sj1

0,1
Comm: tcp://127.0.0.1:52702,Total threads: 2
Dashboard: http://127.0.0.1:52705/status,Memory: 1.95 GiB
Nanny: tcp://127.0.0.1:52671,
Local directory: C:\Users\shula\AppData\Local\Temp\dask-worker-space\worker-1164mq3u,Local directory: C:\Users\shula\AppData\Local\Temp\dask-worker-space\worker-1164mq3u

0,1
Comm: tcp://127.0.0.1:52707,Total threads: 2
Dashboard: http://127.0.0.1:52708/status,Memory: 1.95 GiB
Nanny: tcp://127.0.0.1:52673,
Local directory: C:\Users\shula\AppData\Local\Temp\dask-worker-space\worker-it3zf5o0,Local directory: C:\Users\shula\AppData\Local\Temp\dask-worker-space\worker-it3zf5o0

0,1
Comm: tcp://127.0.0.1:52698,Total threads: 2
Dashboard: http://127.0.0.1:52699/status,Memory: 1.95 GiB
Nanny: tcp://127.0.0.1:52670,
Local directory: C:\Users\shula\AppData\Local\Temp\dask-worker-space\worker-010fxo6d,Local directory: C:\Users\shula\AppData\Local\Temp\dask-worker-space\worker-010fxo6d


In [15]:
# Load the processed S1 dataset from disk.
df_1 = pd.read_csv(
    filepath_or_buffer='../dataset/processed_dataset/final_data_S1.csv',
)

In [16]:
# Features are every column except the 'class' target column.
X_1 = df_1.drop(columns=['class'])
y_1 = df_1['class']

# get_train_test is a project helper; presumably a train/test split -- verify in helpers.
X_train_1, X_test_1, y_train_1, y_test_1 = get_train_test(X_1, y_1)

In [17]:
# Hyper-parameter search space for the random forest grid search.
params = {
    'n_estimators': list(range(3, 20, 2)),   # odd values 3..19
    'criterion': ['gini', 'entropy'],
    'max_depth': list(range(4, 10)),         # 4..9
    'min_samples_split': list(range(2, 10)), # 2..9
    'max_features': ['sqrt', 'log2', None],
}

In [12]:
# Expand the search space into every hyper-parameter combination.
param_dict = ParameterGrid(params)
# ParameterGrid implements len(), so the number of combinations can be
# reported without materialising every parameter dict in a list first.
print(len(param_dict))

2592


In [72]:

@delayed
def run_model(param, X_train, y_train, X_test, y_test):
    """Fit a random forest for one hyper-parameter combination and score it.

    The function is wrapped with ``dask.delayed``, so calling it builds a
    lazy task; the work described below happens when the task is computed.

    Parameters
    ----------
    param : dict
        Keyword arguments forwarded to ``RandomForestClassifier``.
        The dict is left unmodified.
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Features and labels used to compute every reported metric.

    Returns
    -------
    pandas.DataFrame
        A single-row frame (index ``0``) containing the hyper-parameters
        followed by ``accuracy_score``, ``f1_score``, ``precision_score``,
        ``recall_score``, ``roc_auc_score`` and the stringified
        ``confusion_matrix``.
    """
    model_obj = RandomForestClassifier(
        **param,
        class_weight='balanced',
        bootstrap=True,
        oob_score=True,
        random_state=RANDOM_STATE,
    )
    model_obj.fit(X_train, y_train)

    # Predict once and reuse the result for every metric instead of
    # re-running inference on the same test set for each score.
    y_pred = model_obj.predict(X_test)

    # Build a new dict so the caller's parameter dict is not mutated.
    # NOTE: roc_auc_score is computed from hard class predictions here,
    # not from predicted probabilities.
    row = {
        **param,
        "accuracy_score": accuracy_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred),
        "precision_score": precision_score(y_test, y_pred),
        "recall_score": recall_score(y_test, y_pred),
        "roc_auc_score": roc_auc_score(y_test, y_pred),
        # Stored as a string so the matrix fits in a single cell.
        "confusion_matrix": str(confusion_matrix(y_test, y_pred)),
    }
    return pd.DataFrame(row, index=[0])

# run_model is a dask-delayed task: one random forest fit and evaluation per parameter combination

In [74]:
# One delayed task per hyper-parameter combination.
# A list (rather than a one-shot generator) keeps the tasks available, so the
# compute cell can be re-run without having to re-run this cell first.
dfs = [
    run_model(each_param, X_train_1, y_train_1, X_test_1, y_test_1)
    for each_param in param_dict
]

In [76]:
# Execute all delayed tasks; the per-combination results are read
# from the first element of the returned value.
value = compute(dfs, num_workers=4)

In [77]:
# Stack the single-row result frames into one table with a fresh 0..n-1 index.
final = pd.concat(value[0], ignore_index=True)
# Drop the reference to the raw results now that they are combined.
value = []

In [78]:
# Persist the grid-search results (without the index column).
FILE_PATH = 'param/final_data_S1_RF_test1.csv'
final.to_csv(path_or_buf=FILE_PATH, index=False)

In [79]:
# Reload the saved grid-search results from disk and display them.
df_params = pd.read_csv(filepath_or_buffer=FILE_PATH)
df_params

Unnamed: 0,criterion,max_depth,max_features,min_samples_split,n_estimators,accuracy_score,f1_score,precision_score,recall_score,roc_auc_score,confusion_matrix
0,gini,4,sqrt,2,5,0.802632,0.871795,0.842975,0.902655,0.707738,[[ 20 19]\n [ 11 102]]
1,gini,4,sqrt,2,7,0.782895,0.857143,0.838983,0.876106,0.694463,[[20 19]\n [14 99]]
2,gini,4,sqrt,2,9,0.822368,0.884120,0.858333,0.911504,0.737803,[[ 22 17]\n [ 10 103]]
3,gini,4,sqrt,2,11,0.815789,0.879310,0.857143,0.902655,0.733379,[[ 22 17]\n [ 11 102]]
4,gini,4,sqrt,2,13,0.809211,0.875536,0.850000,0.902655,0.720558,[[ 21 18]\n [ 11 102]]
...,...,...,...,...,...,...,...,...,...,...,...
2586,entropy,9,,9,11,0.815789,0.883333,0.834646,0.938053,0.699796,[[ 18 21]\n [ 7 106]]
2587,entropy,9,,9,13,0.815789,0.883333,0.834646,0.938053,0.699796,[[ 18 21]\n [ 7 106]]
2588,entropy,9,,9,15,0.809211,0.879668,0.828125,0.938053,0.686975,[[ 17 22]\n [ 7 106]]
2589,entropy,9,,9,17,0.815789,0.884298,0.829457,0.946903,0.691400,[[ 17 22]\n [ 6 107]]


In [80]:
# Rank the parameter combinations by recall, best first.
# The sorted table is kept in memory only; it is not written back to disk.
df_params = df_params.sort_values(by='recall_score', ascending=False)

In [81]:
# Show the first five rows of the results table.
df_params.head(n=5)

Unnamed: 0,criterion,max_depth,max_features,min_samples_split,n_estimators,accuracy_score,f1_score,precision_score,recall_score,roc_auc_score,confusion_matrix
2382,entropy,9,sqrt,2,17,0.842105,0.901639,0.839695,0.973451,0.717495,[[ 18 21]\n [ 3 110]]
1218,gini,9,log2,9,11,0.842105,0.901639,0.839695,0.973451,0.717495,[[ 18 21]\n [ 3 110]]
1113,gini,9,sqrt,5,17,0.802632,0.88,0.80292,0.973451,0.640572,[[ 12 27]\n [ 3 110]]
653,gini,7,sqrt,2,15,0.861842,0.912863,0.859375,0.973451,0.755956,[[ 21 18]\n [ 3 110]]
652,gini,7,sqrt,2,13,0.848684,0.90535,0.846154,0.973451,0.730315,[[ 19 20]\n [ 3 110]]


#### Best hyperparameters of the RandomForest classifier, tested in a custom bagging of Decision Trees.  
##### Criterion: Entropy
##### Max_depth: 5
##### Max_Features: None
##### Min_samples_split: 7
##### n_estimators: 11

In [18]:
# Build the custom bagging classifier (RFB, from helpers) with the chosen hyper-parameters.
# NOTE(review): max_features is not passed although the summary lists "Max_Features: None" --
# confirm that this matches RFB's default.
# NOTE(review): random_state is hard-coded to 142 rather than RANDOM_STATE -- confirm this is intended.
random_forest_classifier = RFB(
    n_estimators=11,
    criterion='entropy',
    max_depth=5,
    min_samples_split=7,
    class_weight='balanced',
    random_state=142,
)

In [19]:
# Train the custom bagging classifier on the training split.
random_forest_classifier.fit(X_train_1, y_train_1)
# Predictions for the test split; used for all metrics reported below.
preds = random_forest_classifier.predict(X_test_1)

In [20]:
# Report the test-set metrics of the fitted classifier, one per line.
print(
    f"Accuracy: {accuracy_score(y_test_1, preds)} \n"
    f" f1_score: {f1_score(y_test_1, preds)} \n"
    f" Precision: {precision_score(y_test_1, preds)} \n"
    f" Recall: {recall_score(y_test_1, preds)} \n"
    f" Roc_auc: {roc_auc_score(y_test_1, preds)} \n"
    f" Confusion Matrix: \n"
    f" {confusion_matrix(y_test_1, preds)}"
)

Accuracy: 0.7828947368421053 
 f1_score: 0.8520179372197311 
 Precision: 0.8636363636363636 
 Recall: 0.8407079646017699 
 Roc_auc: 0.7280462899931927 
 Confusion Matrix: 
 [[24 15]
 [18 95]]
