In [1]:
import os
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score
# sklearn.metrics.accuracy_score
import xgboost as xgb
import numpy as np
import itertools 
import tqdm
from time import sleep


from sklearn import tree

In [2]:
import warnings 
warnings.filterwarnings("ignore")

In [3]:
# mannually generate the search grid
def makeGrid(pars_dict):  
    keys=pars_dict.keys()
    combinations=itertools.product(*pars_dict.values())
    ds=[dict(zip(keys,cc)) for cc in combinations]
    return ds

# Dataset Preparation

In [4]:
df_raw = pd.read_excel("../Dataset/MergedDataset_231207_ForElly_Excel.xlsx")

# Remove "." in the feature column
row_num, column_num = df_raw.shape
for rowID in range(row_num):
    for columnID in range(column_num):
        if "." == df_raw.iloc[rowID, columnID]:
            df_raw.iloc[rowID, columnID] = np.nan

# prepare the feature list
featureList = []
for item in df_raw.columns:
    if item.startswith("Item"):
        featureList.append(item)

## Use EndDesc as the label

In [5]:
df = df_raw[df_raw["EndDesc"] == df_raw["EndDesc"]]
df.reset_index().drop("index", axis=1)

print(df["EndDesc"].unique())
print(df.shape)

labelRangeList = df["EndDesc"].unique().tolist()
print(labelRangeList)
labelList = []
for idx, row in df.iterrows():
    label = labelRangeList.index(row["EndDesc"])
    labelList.append(label)
df["label"] = labelList

labelList = ["label"]

['Mutually agreed completion of treatment'
 'Termition of treatment earlier than Care Professiol planned'
 'Not suitable for IAPT service - no action taken or directed back to referrer'
 'Referred to another therapy service by mutual agreement']
(570, 279)
['Mutually agreed completion of treatment', 'Termition of treatment earlier than Care Professiol planned', 'Not suitable for IAPT service - no action taken or directed back to referrer', 'Referred to another therapy service by mutual agreement']


## Use RecoveryDesc as the label

In [5]:
df = df_raw[df_raw["RecoveryDesc"] == df_raw["RecoveryDesc"]]
df = df[(df["RecoveryDesc"].isin(["At recovery", "Not at recovery"]))]
df.reset_index().drop("index", axis=1)
print(df.shape)

labelList = []
for idx, row in df.iterrows():
    if (row["ReliableChangeDesc"] == "Reliable improvement") & (row["ReliableRecoveryDesc"] == "Reliable recovery") & (row["RecoveryDesc"] == "At recovery"):
        labelList.append(1)
    else:
        labelList.append(0)
df["label"] = labelList
labelList = ["label"]

(391, 279)


# Train Test Split

In [6]:
# Keep 10% data as the test data
train, test = train_test_split(df, test_size=0.1)

## DT training

In [37]:
# EndDesc: 0.5964912280701754
# param_dict = {
#     'criterion': 'gini', 
#     'max_depth': 8, 
#     'max_features': 20, 
#     'max_leaf_nodes': None, 
#     'min_samples_leaf': 4, 
#     'min_samples_split': 64, 
#     'min_weight_fraction_leaf': 0.0
# }

# Recovery: 0.60
param_dict = {
    'criterion': 'entropy', 
    'max_depth': None, 
    'max_features': 80, 
    'max_leaf_nodes': None, 
    'min_samples_leaf': 16, 
    'min_samples_split': 64, 
    'min_weight_fraction_leaf': 0.0
}
clf = tree.DecisionTreeClassifier(**param_dict)

In [45]:
scores = cross_val_score(clf, train[featureList].values, train[labelList].values, cv=5, scoring="accuracy")
print(scores)
print("Accuracy mean: {:.4f}, accuracy std: {:.4f}".format(np.mean(scores), np.std(scores)))

[0.47887324 0.52857143 0.51428571 0.6        0.48571429]
Accuracy mean: 0.5215, accuracy std: 0.0433


In [98]:
clf = tree.DecisionTreeClassifier(**param_dict)
clf = clf.fit(train[featureList].values,train[labelList].values)
preds = clf.predict(test[featureList].values)
labels = test[labelList].values
acc = accuracy_score(labels, preds)
print(acc)

0.55


## Grid Search for DT

In [7]:
# Use MyGridSearch to estimate the runtime, then switch to standard Gridsearch to get better performance!

In [7]:
param_dict = {
    "criterion": ["gini","entropy","log_loss"],
    "max_depth": [None,2,8,16],
    "min_samples_split": [2,4,16,64],
    "min_samples_leaf": [1,4,16,64],
    "min_weight_fraction_leaf": [0.0, 0.2, 0.4, 0.5],
    "max_features": [None, 20, 40, 80],
    "max_leaf_nodes": [None, 20, 40, 80],   
}

In [8]:
def MyGridSearch(param_dict, X, y):
    searchSpace = makeGrid(param_dict)
    print("Search Space Size:" + len(searchSpace).__str__())
    resultList = []
    for param in tqdm.tqdm(searchSpace):
        #sleep(1)
        clf = tree.DecisionTreeClassifier(**param)
        scores = cross_val_score(clf, X, y, cv=5, scoring="accuracy", n_jobs=5)
        result = {}
        result["acc_mean"] = np.mean(scores)
        result["acc_std"] = np.std(scores)
        result["acc"] = scores
        result["param"] = param
        resultList.append(result)
    sortedResult = sorted(resultList, key=lambda x: x["acc_mean"], reverse=True)
    print(sortedResult[:5])
    return sortedResult

In [9]:
def StandardGridSearch(param_dict, X, y):
    clf = tree.DecisionTreeClassifier()
    grid_clf = GridSearchCV(clf, param_dict, n_jobs=5, cv=5, scoring='accuracy')
    grid_clf.fit(X,y)
    print("The best score is {}".format(grid_clf.best_score_))
    print("The best params is: {}".format(grid_clf.best_params_))
    return grid_clf

In [10]:
# Use my search grid
results = MyGridSearch(param_dict, train[featureList].values, train[labelList].values)

Search Space Size:12288


100%|██████████| 12288/12288 [04:41<00:00, 43.64it/s]

[{'acc_mean': 0.6097786720321932, 'acc_std': 0.0231636993496265, 'acc': array([0.57746479, 0.6       , 0.62857143, 0.6       , 0.64285714]), 'param': {'criterion': 'log_loss', 'max_depth': 16, 'min_samples_split': 2, 'min_samples_leaf': 16, 'min_weight_fraction_leaf': 0.0, 'max_features': 80, 'max_leaf_nodes': 40}}, {'acc_mean': 0.6096981891348088, 'acc_std': 0.03548565695956081, 'acc': array([0.6056338 , 0.57142857, 0.65714286, 0.64285714, 0.57142857]), 'param': {'criterion': 'log_loss', 'max_depth': 16, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.2, 'max_features': 40, 'max_leaf_nodes': 40}}, {'acc_mean': 0.6068812877263581, 'acc_std': 0.03009495422511941, 'acc': array([0.5915493 , 0.57142857, 0.64285714, 0.64285714, 0.58571429]), 'param': {'criterion': 'entropy', 'max_depth': 8, 'min_samples_split': 4, 'min_samples_leaf': 64, 'min_weight_fraction_leaf': 0.0, 'max_features': 40, 'max_leaf_nodes': None}}, {'acc_mean': 0.6040241448692153, 'acc_std': 0.0




In [13]:
# Use standard search grid
results = StandardGridSearch(param_dict, train[featureList].values, train[labelList].values)

The best score is 0.5984708249496982
The best params is: {'criterion': 'entropy', 'max_depth': None, 'max_features': 80, 'max_leaf_nodes': None, 'min_samples_leaf': 16, 'min_samples_split': 64, 'min_weight_fraction_leaf': 0.0}
