In [1]:
import os
import pandas as pd
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score
# sklearn.metrics.accuracy_score
import xgboost as xgb
import numpy as np
import itertools 
import tqdm
from time import sleep
import random

In [2]:
random.randint(0, 32767)

3333

In [3]:
import warnings 
warnings.filterwarnings("ignore")

In [4]:
# mannually generate the search grid
def makeGrid(pars_dict):  
    keys=pars_dict.keys()
    combinations=itertools.product(*pars_dict.values())
    ds=[dict(zip(keys,cc)) for cc in combinations]
    return ds

# Dataset Preparation

In [5]:
df_raw = pd.read_excel("../Dataset/MergedDataset_231207_ForElly_Excel.xlsx")

# Remove "." in the feature column
row_num, column_num = df_raw.shape
for rowID in range(row_num):
    for columnID in range(column_num):
        if "." == df_raw.iloc[rowID, columnID]:
            df_raw.iloc[rowID, columnID] = np.nan

# prepare the feature list
featureList = []
for item in df_raw.columns:
    if item.startswith("Item"):
        featureList.append(item)

## Use EndDesc as the label

In [6]:
df = df_raw[df_raw["EndDesc"] == df_raw["EndDesc"]]
df.reset_index().drop("index", axis=1)

print(df["EndDesc"].unique())
print(df.shape)

labelRangeList = df["EndDesc"].unique().tolist()
print(labelRangeList)
labelList = []
for idx, row in df.iterrows():
    label = labelRangeList.index(row["EndDesc"])
    labelList.append(label)
df["label"] = labelList

labelList = ["label"]

['Mutually agreed completion of treatment'
 'Termition of treatment earlier than Care Professiol planned'
 'Not suitable for IAPT service - no action taken or directed back to referrer'
 'Referred to another therapy service by mutual agreement']
(570, 279)
['Mutually agreed completion of treatment', 'Termition of treatment earlier than Care Professiol planned', 'Not suitable for IAPT service - no action taken or directed back to referrer', 'Referred to another therapy service by mutual agreement']


## Use RecoveryDesc as the label

In [6]:
df = df_raw[df_raw["RecoveryDesc"] == df_raw["RecoveryDesc"]]
df = df[(df["RecoveryDesc"].isin(["At recovery", "Not at recovery"]))]
df.reset_index().drop("index", axis=1)
print(df.shape)

labelList = []
for idx, row in df.iterrows():
    if (row["ReliableChangeDesc"] == "Reliable improvement") & (row["ReliableRecoveryDesc"] == "Reliable recovery") & (row["RecoveryDesc"] == "At recovery"):
        labelList.append(1)
    else:
        labelList.append(0)
df["label"] = labelList
labelList = ["label"]

(391, 279)


# Train Test Split

In [7]:
# Keep 10% data as the test data
train, test = train_test_split(df, test_size=0.1)

## RF training

In [14]:
# EndDesc: 0.631578947368421
param_dict = {
    'criterion': 'entropy', 
    'max_depth': None, 
    'max_features': 20, 
    'max_leaf_nodes': None, 
    'max_samples': 0.7, 
    'min_samples_leaf': 1, 
    'min_samples_split': 2, 
    'min_weight_fraction_leaf': 0.0, 
    'n_estimators': 50, 
    'n_jobs': -1, 
    'random_state': 24975
}
# clf = tree.DecisionTreeClassifier(**param_dict)
clf = RandomForestClassifier(**param_dict)

In [15]:
scores = cross_val_score(clf, train[featureList].values, train[labelList].values.squeeze(), cv=5, scoring="accuracy")
print(scores)
print("Accuracy mean: {:.4f}, accuracy std: {:.4f}".format(np.mean(scores), np.std(scores)))

[0.58252427 0.57281553 0.58252427 0.67647059 0.64705882]
Accuracy mean: 0.6123, accuracy std: 0.0416


In [16]:
clf = RandomForestClassifier(n_estimators=50, random_state=random.randint(0, 32767))
clf = clf.fit(train[featureList].values,train[labelList].values.squeeze())
preds = clf.predict(test[featureList].values)
labels = test[labelList].values
acc = accuracy_score(labels, preds)
print(acc)

0.631578947368421


## Grid Search for RF

In [8]:
# Use MyGridSearch to estimate the runtime, then switch to standard Gridsearch to get better performance!

In [8]:
param_dict = {
    "n_estimators": [10, 50, 100, 150],
    "criterion": ["gini","entropy","log_loss"],
    "max_depth": [None, 8, 16],
    "min_samples_split": [2, 4],
    "min_samples_leaf": [1, 4],
    "min_weight_fraction_leaf": [0.0, 0.3, 0.5],
    "max_features": [None, "sqrt", "log2", 20, 80],
    "max_leaf_nodes": [None, 20, 40, 80],
    "max_samples": [None, 0.3, 0.5, 0.7],
    "random_state": [random.randint(0, 32767)],
    "n_jobs": [-1],
}

In [9]:
def MyGridSearch(param_dict, X, y):
    searchSpace = makeGrid(param_dict)
    print("Search Space Size:" + len(searchSpace).__str__())
    resultList = []
    for param in tqdm.tqdm(searchSpace):
        #sleep(1)
        clf = RandomForestClassifier(**param)
        scores = cross_val_score(clf, X, y, cv=5, scoring="accuracy", n_jobs=5)
        result = {}
        result["acc_mean"] = np.mean(scores)
        result["acc_std"] = np.std(scores)
        result["acc"] = scores
        result["param"] = param
        resultList.append(result)
    sortedResult = sorted(resultList, key=lambda x: x["acc_mean"], reverse=True)
    print(sortedResult[:5])
    return sortedResult

In [10]:
def StandardGridSearch(param_dict, X, y):
    clf = RandomForestClassifier()
    grid_clf = GridSearchCV(clf, param_dict, n_jobs=5, cv=5, scoring='accuracy')
    grid_clf.fit(X,y)
    print("The best score is {}".format(grid_clf.best_score_))
    print("The best params is: {}".format(grid_clf.best_params_))
    return grid_clf

In [58]:
# Use my search grid
results = MyGridSearch(param_dict, train[featureList].values, train[labelList].values.squeeze())

Search Space Size:34560


  1%|          | 241/34560 [00:13<32:29, 17.60it/s]


KeyboardInterrupt: 

In [11]:
# Use standard search grid
results = StandardGridSearch(param_dict, train[featureList].values, train[labelList].values.squeeze())

The best score is 0.5983098591549295
The best params is: {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': 20, 'max_samples': 0.7, 'min_samples_leaf': 4, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 10, 'n_jobs': -1, 'random_state': 18879}
