In [1]:
# Basic modules
import traceback
import numpy as np
import pandas as pd

# Data operation modules
from sklearn.model_selection import StratifiedKFold
from sklearn import cross_validation
from sklearn.model_selection import ParameterGrid

# Classifier models
import lightgbm as lgb

# Evaluation modules
import sklearn.metrics as sm



In [2]:
# Configurations for the model
def run_Config():
    """Return the experiment settings used by the rest of the notebook.

    Returns
    -------
    tuple
        ``(data_name, k_holdout, k_cv, pars)`` where ``data_name`` is the
        spreadsheet file name, ``k_holdout`` and ``k_cv`` are fold counts, and
        ``pars`` is the list of dicts produced by expanding the search space
        below with ``ParameterGrid`` (one dict per combination).
    """
    spreadsheet = 'Thoracic_Surgery_onehot.xlsx'
    n_holdout_folds = 10
    n_grid_folds = 5

    # Each key maps to the list of candidate values for that setting.
    # 'indicator_name' is expanded together with the LightGBM settings, so
    # every grid point also carries the name of a metric.
    search_space = dict(
        indicator_name=['TPR', 'MAcc', 'F1(Micro)', 'F1(Macro)'],
        is_unbalance=['True'],
        application=['multiclass'],
        num_class=[2],
        boosting=['gbdt'],
        num_iterations=[100],
        learning_rate=[0.1],
        num_leaves=[31],
        num_trees=[5, 10, 30]
    )
    grid_points = list(ParameterGrid(search_space))
    return spreadsheet, n_holdout_folds, n_grid_folds, grid_points

In [3]:
# Load data
def run_load_data(data_name):
    """Read the spreadsheet and split it into features and labels.

    Parameters
    ----------
    data_name : str
        Path of the Excel file handed to ``pd.read_excel``.

    Returns
    -------
    tuple
        ``(all_fea, all_label)``: every column but the last as the feature
        matrix, and the last column after it has been passed through
        ``get_normal_label``.
    """
    table = pd.read_excel(data_name).values
    features = table[:, :-1]
    # The last column holds the class labels; get_normal_label re-codes them.
    labels = get_normal_label(table[:, -1])
    return features, labels

def get_normal_label(y):
    """Re-code class labels as consecutive codes 0, 1, ..., K-1.

    The smallest distinct value in ``y`` becomes 0, the next one 1, and so on
    (the ordering is the sorted order produced by ``np.unique``).

    Parameters
    ----------
    y : array-like
        Raw class labels.  The input is not modified.

    Returns
    -------
    numpy.ndarray
        New array with the shape of ``y`` holding the label codes.  Numeric
        inputs keep their dtype; any other input yields an integer array.
    """
    labels = np.asarray(y)
    # return_inverse yields, for every element, its rank among the sorted
    # unique values.  A value-by-value in-place rewrite is not safe here: it
    # merges classes whenever a freshly written code equals a raw label that
    # has not been processed yet (e.g. [-1, 0] would end up as [1, 1]).
    _, codes = np.unique(labels, return_inverse=True)
    codes = codes.reshape(labels.shape)
    if np.issubdtype(labels.dtype, np.number):
        # Keep the caller-visible dtype stable for numeric label columns.
        codes = codes.astype(labels.dtype)
    return codes

In [4]:
# Run both HoldoutCV and GridSearchCV
def _select_best_pars(learn_fea, learn_label, k_cv, pars, random_state=None):
    """Return the setting in ``pars`` with the highest mean inner-CV score."""
    best_pars = None
    best_score = None
    for i_pars in pars:
        gridcv = StratifiedKFold(n_splits=k_cv, shuffle=True,
                                 random_state=random_state)
        fold_scores = []
        for i_train, i_valid in gridcv.split(learn_fea, learn_label):
            valid_pre, pars_new = run_LGBM(i_pars,
                                           learn_fea[i_train],
                                           learn_label[i_train],
                                           learn_fea[i_valid])
            eval_dict = run_evaluation(valid_pre, learn_label[i_valid])
            # Each setting names the metric it is scored with.
            fold_scores.append(eval_dict[i_pars['indicator_name']])
        mean_score = np.mean(fold_scores)
        # The first candidate is always accepted, so a winner exists even when
        # every mean score is 0 (or NaN); ties keep the earlier candidate.
        if best_pars is None or mean_score > best_score:
            best_pars, best_score = pars_new, mean_score
    return best_pars


def run_doubleCV(all_fea, all_label, k_holdout, k_cv, pars, random_state=None):
    """Nested cross-validation: outer hold-out folds with an inner grid search.

    For every outer stratified fold the learning part is searched over all
    settings in ``pars`` with an inner stratified ``k_cv``-fold split; the
    setting with the highest mean validation score is then refit on the whole
    learning part and evaluated on the held-out part.

    Each setting is scored with the metric named by its own
    ``'indicator_name'`` entry, so settings that name different metrics are
    compared directly against each other.
    NOTE(review): confirm that cross-metric comparison is intended.

    Parameters
    ----------
    all_fea : numpy.ndarray
        Feature matrix, indexed by row with integer index arrays.
    all_label : numpy.ndarray
        Class label per row of ``all_fea``.
    k_holdout : int
        Number of outer (hold-out) folds.
    k_cv : int
        Number of inner (grid-search) folds.
    pars : sequence of dict
        Candidate settings; each dict must provide ``'indicator_name'`` and
        the keys read by ``run_LGBM``.
    random_state : optional
        Passed to every ``StratifiedKFold``.  ``None`` (default) keeps the
        unseeded shuffling; an integer makes the splits repeatable and gives
        every candidate the same inner folds.

    Returns
    -------
    tuple
        ``(res_list, opt_pars_list)``: per outer fold, the metric dict from
        ``run_evaluation`` and the selected setting.

    Raises
    ------
    ValueError
        If ``pars`` is empty.
    """
    pars = list(pars)
    if not pars:
        raise ValueError('pars must contain at least one parameter setting')

    res_list = []
    opt_pars_list = []
    holdoutcv = StratifiedKFold(n_splits=k_holdout, shuffle=True,
                                random_state=random_state)
    for i_learn, i_test in holdoutcv.split(all_fea, all_label):
        # Current learning and held-out data
        learn_fea, test_fea = all_fea[i_learn], all_fea[i_test]
        learn_label, test_label = all_label[i_learn], all_label[i_test]

        # Inner grid search, then refit on the full learning part
        best_pars = _select_best_pars(learn_fea, learn_label, k_cv, pars,
                                      random_state)
        test_pre, _ = run_LGBM(best_pars, learn_fea, learn_label, test_fea)

        # Evaluate and store the held-out result
        res_list.append(run_evaluation(test_pre, test_label))
        opt_pars_list.append(best_pars)

    return res_list, opt_pars_list

In [5]:
# Run LightGBM
def run_LGBM(p, X, y, Z):
    """Train LightGBM on ``(X, y)`` and predict class labels for ``Z``.

    Parameters
    ----------
    p : dict
        Hyper-parameters; must contain every key listed in ``forwarded_keys``
        below.  Other entries are not passed to LightGBM.
    X : training feature matrix
    y : training labels
    Z : feature matrix to predict

    Returns
    -------
    tuple
        ``(zpre, p)``: the index of the highest score per row of the
        prediction for ``Z``, and the unchanged ``p``.
    """
    dataset = lgb.Dataset(X, label=y)

    # NOTE(review): the LightGBM parameter docs list 'num_trees' as an alias
    # of 'num_iterations'; both are forwarded here, so which value takes
    # effect is up to LightGBM -- confirm.
    forwarded_keys = (
        'is_unbalance', 'application', 'num_class', 'boosting',
        'num_iterations', 'learning_rate', 'num_leaves', 'num_trees',
    )
    booster_params = dict((key, p[key]) for key in forwarded_keys)

    booster = lgb.train(booster_params, dataset)
    # argmax along axis 1 assumes predict() returns one score column per class.
    class_scores = booster.predict(Z)
    predicted = np.argmax(class_scores, axis=1)
    return predicted, p

In [6]:
# Evaluate the performance
def run_evaluation(p, y):
    """Score predicted labels ``p`` against true labels ``y``.

    Parameters
    ----------
    p : numpy.ndarray
        Predicted labels.
    y : numpy.ndarray
        True labels.

    Returns
    -------
    dict
        Percentages keyed by ``'TPR'``, ``'TNR'``, ``'MAcc'``, ``'GM'``,
        ``'F1(Macro)'``, ``'F1(Micro)'`` and ``'Acc'``.  ``'TPR'`` / ``'TNR'``
        are computed on the samples whose true label equals the largest /
        smallest value in ``y``; ``'MAcc'`` and ``'GM'`` are their arithmetic
        and geometric means.
    """
    def agreement_on(label_value):
        # 1 - hamming loss on the samples whose true label is label_value.
        rows = np.nonzero(y == label_value)
        return (1 - sm.hamming_loss(y[rows], p[rows]))*100

    tpr = agreement_on(max(y))
    tnr = agreement_on(min(y))
    return {
        'TPR': tpr,
        'TNR': tnr,
        'MAcc': np.mean([tpr, tnr]),
        'GM': np.sqrt(tpr*tnr),
        'F1(Macro)': sm.f1_score(y, p, average='macro')*100,
        'F1(Micro)': sm.f1_score(y, p, average='micro')*100,
        'Acc': sm.accuracy_score(y, p)*100,
    }

In [7]:
def get_dict(d):
    """Render every entry of ``d`` as a ``'<key>_<value>'`` string.

    Parameters
    ----------
    d : dict
        Mapping with string keys; values are converted with ``str``.

    Returns
    -------
    list of str
        One string per key, in the iteration order of ``d``.
    """
    return [key + '_' + str(d[key]) for key in d]

In [None]:
if __name__ == '__main__':
    # Configure the experiment, load the data and run the nested CV.
    data_name, k_holdout, k_cv, pars = run_Config()
    all_fea, all_label = run_load_data(data_name)
    res_list, opt_pars_list = run_doubleCV(all_fea, all_label, k_holdout, k_cv, pars)

    # One report per outer fold: the metrics, then the selected setting.
    # Every print call gets a single pre-formatted argument, so the output is
    # identical whether print is a statement (Python 2) or a function
    # (Python 3).
    for fold_scores, fold_pars in zip(res_list, opt_pars_list):
        print('%s with hyper-parameters:' % (get_dict(fold_scores),))
        print(get_dict(fold_pars))
        print('-----------------------')