In [1]:
import os
import pandas as pd
from xgboost import XGBClassifier
# import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score
# sklearn.metrics.accuracy_score
import xgboost as xgb
import numpy as np
import itertools 
import tqdm
from time import sleep
import random


In [2]:
import warnings 
warnings.filterwarnings("ignore")

In [3]:
# mannually generate the search grid
def makeGrid(pars_dict):  
    keys=pars_dict.keys()
    combinations=itertools.product(*pars_dict.values())
    ds=[dict(zip(keys,cc)) for cc in combinations]
    return ds

# Dataset Preparation

In [4]:
df_raw = pd.read_excel("../Dataset/MergedDataset_231207_ForElly_Excel.xlsx")

# Remove "." in the feature column
row_num, column_num = df_raw.shape
for rowID in range(row_num):
    for columnID in range(column_num):
        if "." == df_raw.iloc[rowID, columnID]:
            df_raw.iloc[rowID, columnID] = np.nan

# prepare the feature list
featureList = []
for item in df_raw.columns:
    if item.startswith("Item"):
        featureList.append(item)

## Use EndDesc as the label

In [5]:
df = df_raw[df_raw["EndDesc"] == df_raw["EndDesc"]]
df.reset_index().drop("index", axis=1)

print(df["EndDesc"].unique())
print(df.shape)

labelRangeList = df["EndDesc"].unique().tolist()
print(labelRangeList)
# labelContent = []
# for idx, row in df.iterrows():
#     index = labelRangeList.index(row["EndDesc"])
#     label = [0 for i in range(len(labelRangeList))]
#     label[index] = 1
#     labelContent.append(label)
# # df["label"] = labelList

# labelContent = np.array(labelContent)

# labelList = ["label_{}".format(i) for i in range(len(labelRangeList))]
# print(labelList)
# for idx, item in enumerate(labelList):
#     df[item] = labelContent[:, idx]

labelList = []
for idx, row in df.iterrows():
    label = labelRangeList.index(row["EndDesc"])
    labelList.append(label)
df["label"] = labelList

labelList = ["label"]

['Mutually agreed completion of treatment'
 'Termition of treatment earlier than Care Professiol planned'
 'Not suitable for IAPT service - no action taken or directed back to referrer'
 'Referred to another therapy service by mutual agreement']
(570, 279)
['Mutually agreed completion of treatment', 'Termition of treatment earlier than Care Professiol planned', 'Not suitable for IAPT service - no action taken or directed back to referrer', 'Referred to another therapy service by mutual agreement']


## Use RecoveryDesc as the label

In [None]:
df = df_raw[df_raw["RecoveryDesc"] == df_raw["RecoveryDesc"]]
df = df[(df["RecoveryDesc"].isin(["At recovery", "Not at recovery"]))]
df.reset_index().drop("index", axis=1)
print(df.shape)

labelList = []
for idx, row in df.iterrows():
    if (row["ReliableChangeDesc"] == "Reliable improvement") & (row["ReliableRecoveryDesc"] == "Reliable recovery") & (row["RecoveryDesc"] == "At recovery"):
        labelList.append(1)
    else:
        labelList.append(0)
df["label"] = labelList
labelList = ["label"]

## Train Test Split

In [6]:
# Keep 10% data as the test data
train, test = train_test_split(df, test_size=0.1)

## LightGBM training

In [12]:
# param_dict = {
#     'criterion': 'entropy', 
#     'max_depth': None, 
#     'max_features': 80, 
#     'max_leaf_nodes': None, 
#     'min_samples_leaf': 16, 
#     'min_samples_split': 64, 
#     'min_weight_fraction_leaf': 0.0
# }

param_dict = {
    'colsample_bytree': 0.7, 
    'learning_rate': 0.1, 
    'max_depth': -1, 
    'min_child_samples': 40, 
    'n_estimators': 10, 
    'n_jobs': -1, 
    'num_leaves': 15, 
    'random_state': 12181, 
    'reg_alpha': 10, 
    'reg_lambda': 0.0, 
    'subsample': 0.3, 
    'verbose': -1
}

In [13]:
# clf = lgb.LGBMClassifier(objective='multiclass',num_leaves=31,learning_rate=0.05,n_estimators=20)
clf = LGBMClassifier(verbose=-1)

In [14]:
scores = cross_val_score(clf, train[featureList].values, train[labelList].values.squeeze(), cv=5, scoring="accuracy")
print(scores)
print("Accuracy mean: {:.4f}, accuracy std: {:.4f}".format(np.mean(scores), np.std(scores)))

[0.60194175 0.55339806 0.49514563 0.61764706 0.58823529]
Accuracy mean: 0.5713, accuracy std: 0.0436


In [15]:
clf = LGBMClassifier(verbose=-1)
clf = clf.fit(train[featureList].values,train[labelList].values.squeeze())
preds = clf.predict(test[featureList].values)
labels = test[labelList].values
acc = accuracy_score(labels, preds)
print(acc)

0.543859649122807


In [16]:
acc = accuracy_score(labels, preds)
print(acc)

0.543859649122807


## Grid Search for LGBM

In [7]:
# Use MyGridSearch to estimate the runtime, then switch to standard Gridsearch to get better performance!

In [8]:
param_dict = {
    "n_estimators": [10, 50, 100, 150],
    "max_depth": [-1, 8, 16],
    "min_child_samples": [10, 20, 40],
    "num_leaves": [15, 31, 63],
    "learning_rate": [0.05, 0.1, 0.5],
    "colsample_bytree": [0.3, 0.5, 0.7, 1.0],
    "subsample": [0.3, 0.5, 0.7, 1.0],
    "reg_alpha": [0.0, 0.1, 0.5, 1, 10], 
    "reg_lambda": [0.0, 0.1, 0.5, 1, 10],
    "random_state": [random.randint(0, 32767)],
    "n_jobs": [-1],
    "verbose": [-1]
}

In [9]:
def MyGridSearch(param_dict, X, y):
    searchSpace = makeGrid(param_dict)
    print("Search Space Size:" + len(searchSpace).__str__())
    resultList = []
    for param in tqdm.tqdm(searchSpace):
        #sleep(1)
        clf = LGBMClassifier(**param)
        scores = cross_val_score(clf, X, y, cv=5, scoring="accuracy", n_jobs=5)
        result = {}
        result["acc_mean"] = np.mean(scores)
        result["acc_std"] = np.std(scores)
        result["acc"] = scores
        result["param"] = param
        resultList.append(result)
    sortedResult = sorted(resultList, key=lambda x: x["acc_mean"], reverse=True)
    print(sortedResult[:5])
    return sortedResult

In [10]:
def StandardGridSearch(param_dict, X, y):
    clf = LGBMClassifier()
    grid_clf = GridSearchCV(clf, param_dict, n_jobs=5, cv=5, scoring='accuracy')
    grid_clf.fit(X,y)
    print("The best score is {}".format(grid_clf.best_score_))
    print("The best params is: {}".format(grid_clf.best_params_))
    return grid_clf

In [15]:
# Use my search grid
results = MyGridSearch(param_dict, train[featureList].values, train[labelList].values.squeeze())

Search Space Size:129600


  0%|          | 50/129600 [00:13<9:48:05,  3.67it/s]


KeyboardInterrupt: 

In [11]:
# Use standard search grid
results = StandardGridSearch(param_dict, train[featureList].values, train[labelList].values.squeeze())

The best score is 0.6159908623643633
The best params is: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 40, 'n_estimators': 10, 'n_jobs': -1, 'num_leaves': 15, 'random_state': 12181, 'reg_alpha': 10, 'reg_lambda': 0.0, 'subsample': 0.3, 'verbose': -1}
