### Binary-class Threshold-based Rule Discovery

In [37]:
# Basic models
import traceback
import numpy as np
import pandas as pd
import scipy.stats as scs

# Data operation models
from sklearn.model_selection import StratifiedKFold
from sklearn import cross_validation
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import LabelEncoder

# Evaluation models
import sklearn.metrics as sm

In [38]:
# Configurations for the model
def run_Config():
    data_name = 'Thoracic_test.xlsx'
    k_holdout = 5
    k_cv = 5
    origin_pars = {
        'grid_indicator':['GM'],
        'indicator':['F1(Macro)','GM','Acc_Macro'],
        'chi_name': ['p','chi2'],
        'chi_value':[0.05]
    }
    pars = list(ParameterGrid(origin_pars))
    return data_name, k_holdout, k_cv, pars

In [39]:
# Load data
def run_load_data(data_name):
    all = pd.read_excel(data_name)
    label = all.iloc[:, -1]    # get labels
    data = all.iloc[:, :-1] # get data
    # some preprocessing...
    return data, label

In [40]:
# Run both HoldoutCV and GridSearchCV
def run_doubleCV(all_fea, all_label, k_holdout, k_cv, pars):
    # HoldoutCV
    i_t = 0
    res_list = []
    opt_pars_list = []
    all_label_num = get_SimpleNumCode(all_label)
    all_fea_num = np.ones(all_label_num.shape)
    
    holdoutcv = StratifiedKFold(n_splits = k_holdout, shuffle = True)
    for i_learn, i_test in holdoutcv.split(all_fea_num, all_label_num):
        # Times
        i_t = i_t + 1
        print 'Round ', str(i_t), ' Holdout CV----------------------'
        
        # Obtain current learning and heldout data
        learn_fea, test_fea = all_fea.iloc[i_learn,:], all_fea.iloc[i_test,:]
        learn_label, test_label = all_label.iloc[i_learn], all_label.iloc[i_test]
        
        # GridSearchCV
        j_t = 0
        optdata = {'score':0}
        for i_pars in pars:
            # times
            j_t = j_t + 1
            print 'round ', str(j_t), ' gridsearch cv----------------'
            pars_score = []
            learn_label_num = get_SimpleNumCode(learn_label)
            learn_fea_num = np.ones(learn_label_num.shape)
            
            gridcv = StratifiedKFold(n_splits = k_cv, shuffle = True)
            for i_train, i_valid in gridcv.split(learn_fea_num, learn_label_num):
                # obtain current training and validation data
                train_fea, valid_fea = learn_fea.iloc[i_train,:], learn_fea.iloc[i_valid,:]
                train_label, valid_label = learn_label.iloc[i_train], learn_label.iloc[i_valid]
                
                # learn the model
                # i_pars = {'par_name1':par1,'par_name2':par2,...,'par_nameN':parN}
                valid_pre, pars_new = run_RuleDiscovery(i_pars, train_fea, train_label, valid_fea)
                eval_dict = get_binary_eval(valid_pre, valid_label)
                grid_score = eval_dict[i_pars['grid_indicator']]
                pars_score.extend([grid_score])
            if np.mean(pars_score) > optdata['score']:
                optdata['pars'] = pars_new
                optdata['score'] = np.mean(pars_score)
        
        # Holdout testing
        # best_pars is a dict too
        best_pars = optdata['pars']
        test_pre, _ = run_RuleDiscovery(best_pars, learn_fea, learn_label, test_fea)
        
        # Evaluate the prediction
        res_now = get_binary_eval(test_pre, test_label)
        
        # Save results
        res_list.append(res_now)
        opt_pars_list.append(best_pars)
        
    return res_list, opt_pars_list

In [41]:
def get_SimpleNumCode(y):
    clf = LabelEncoder()
    clf.fit(y)
    return clf.transform(y)

In [42]:
# Run Rule Discovery
def run_RuleDiscovery(p, X, y, Z):
    # train and validate rules
    # v-values, f-features, c-classes, t-thresholds
    # rule = [(v,f,c),(v,f,c),...(v,f,c)]
    # threshold = [(t,c),(t,c),...,(t,c)]
    rule = get_rules(X, y, p)
    pos_rule, threshold, pos_name, neg_name = get_binary_validation(X, y, rule, p)
    print 'run_RuleDiscovery:', ' pos:', pos_name, '; neg:', neg_name
    p['threshold'] = threshold
    
    # test rules
    zpre = get_binary_predict(Z, pos_rule, threshold, pos_name, neg_name)
    
    return zpre, p

In [43]:
# Train rules
def get_rules(data, label, par):
    fea_name = data.columns.values # get names of features
    rule = []
    for i_f in fea_name:
        square_now = get_square(data[i_f], label)
        chi_tag_now = get_chi_test(square_now.values, par)
        if chi_tag_now == 0: # the current feature would not be considered(selected)
            continue 
        else:
            rule_now = get_fea_rules(square_now, i_f)
        rule.extend(rule_now)
    return rule

def get_square(fea, label):
    # this function is used to generate the contingency table for fea and class_name
    value_name = set(fea)  # get names of current features
    class_name = set(label) # get names of classes
    F = []
    for i_c in class_name:
        f = []
        for i_v in value_name:
            v_index = get_index(fea, i_v) # find index of current value
            c_index = get_index(label, i_c) # find index of current class
            f_v2c = len(set(v_index).intersection(c_index)) # frequence of each value-class pair  
            f.extend([f_v2c])
        F.extend([f])
    F = pd.DataFrame(F, columns = value_name) # transform double list into dataframe
    F = F.rename(lambda x:list(class_name)[x])
    return F

def get_index(lst_now, name_now):
    return [i for i,j in zip(range(len(lst_now)), lst_now) if j == name_now]

def get_chi_test(square, par):
    # this function is used to generate a tag according to threshold
    threshold_value = par['chi_value']
    threshold_name = par['chi_name']
    chi2, p, _, _ = scs.chi2_contingency(square)
    if threshold_name is 'chi2':
        chi_tag = 1 if threshold_value < chi2 else 0
    elif threshold_name is 'p':
        chi_tag = 1 if threshold_value < p else 0 
    return chi_tag

def get_fea_rules(square, fea_name):
    # this function is used to generate rules for each class from the current features 
    wsquare = get_weighted_square(square)
    best_class_name = wsquare.apply(np.argmax, axis = 0)
    rule_fea = [] # rules generated in current feature
    for i, i_v in enumerate(wsquare.columns):
        rule_fea.extend([(i_v, fea_name, best_class_name[i])]) # (value, feature, class)  
    return rule_fea

def get_weighted_square(Z):
    r = Z.apply(np.sum, axis = 1) # summation of columns
    for i in xrange(len(r)):
        Z.ix[i,:] = Z.ix[i,:].apply(lambda x:x/float(r[i])) # ratio of each element
    return Z


In [44]:
# Valide positive rule in binary-class case
def get_binary_validation(data, label, rule, par):
    # this function is used to determine the optimal threshold for the rule of the positive class
    class_name = list(set(label))
    label_lst = list(label)
    pos_name = class_name[0] if label_lst.count(class_name[0])<=label_lst.count(class_name[1]) else class_name[1]
    neg_name = class_name[0] if class_name[1] == pos_name else class_name[1]
    pos_rule = [(j_v, j_f) for j_v, j_f, j_c in rule if j_c == pos_name] # pos_rule:[(v,f),(v,f),...,(value,fea)]
    upper_fea = len(set([k_f for _,k_f in pos_rule])) # number of features in the rule of the current class
    
    best_res = 0
    threshold = 0 # threshold is a scala
    for i_w in xrange(upper_fea):
        pre_rule_num = get_micro_binary_predict(data, pos_rule)
        pre_label = [pos_name if i_r > i_w else neg_name for i_r in pre_rule_num]
        result_now = get_binary_eval(pre_label, label)
        print 'get_binary_validation:', par['indicator'], '-', result_now[par['indicator']], 'compared with', i_w
        print 'threshold from the last round:', threshold
        print
        if result_now[par['indicator']] > best_res:
            best_res = result_now[par['indicator']]
            threshold = i_w
    return pos_rule, threshold, pos_name, neg_name

def get_micro_binary_predict(data, pos_rule):
    # this function is used to calculate the number of matched rule 
    # (belonging to the same class) for each sample.
    pre_rule_num = []
    for i_data in xrange(len(data.index)):
        sample = zip(data.iloc[i_data,:], data.columns)
        match_num = get_binary_match(sample, pos_rule)
        pre_rule_num.extend([match_num])
    return pre_rule_num

def get_binary_match(sample, pos_rule):
    match_num = 0
    for i_v, i_f in sample:
        for j_v, j_f in pos_rule:
            if i_f == j_f and i_v == j_v:
                match_num = match_num + 1
    return match_num

In [45]:
# Test rules
def get_binary_predict(Z, pos_rule, threshold, pos_name, neg_name):
    print 'get_binary_predict:', ' pos:', pos_name, '; neg:', neg_name
    zpre = []
    for i in xrange(len(Z.index)):
        Z_i = zip(Z.iloc[i,:], Z.columns)
        #print 'Now:', get_binary_match(Z_i, pos_rule), 'compared with', threshold
        if get_binary_match(Z_i, pos_rule) >= threshold:
            zpre.extend([pos_name])
        else:
            zpre.extend([neg_name])
            
    return zpre

In [49]:
# Evaluate the performance
def get_binary_eval(p, y):
    # p and y are two list
    res_dict = {}
    
    p_num = get_SimpleNumCode(p)
    y_num = get_SimpleNumCode(y)
    res_dict['F1(Macro)'] = sm.f1_score(y_num, p_num, average='macro')*100
    res_dict['F1(Micro)'] = sm.f1_score(y_num, p_num, average='micro')*100
    res_dict['Acc'] = sm.accuracy_score(y_num, p_num)*100
    
    class_name = set(y)
    acc_each = []
    for i in class_name:
        p_index = get_index(p, i) # find index of current pre_label
        y_index = get_index(y, i) # find index of current true_label    
        acc_p2y = len(set(p_index).intersection(y_index))/float(len(y_index)) * 100 # acc of curren class
        acc_each.extend([(acc_p2y, i)]) # (accuracy, class)     
    res_dict['Acc_Macro'] = np.mean([a for a, c in acc_each])
    res_dict['GM'] = np.sqrt(np.prod([a for a, c in acc_each]))
    res_dict['Acc_each'] = acc_each
    
    return res_dict

In [50]:
def get_dict(d):
    list_final = []
    for i in d:
        list_now = i + '_' + str(d[i])
        list_final.append(list_now)
    return list_final

In [51]:
if __name__ == '__main__':
    data_name, k_holdout, k_cv, pars = run_Config()
    all_fea, all_label = run_load_data(data_name)
    res_list, opt_pars_list = run_doubleCV(all_fea, all_label, k_holdout, k_cv, pars)
    print 'Each heldout cv result:'
    print '-----------------------'
    for i, j in zip(res_list,opt_pars_list):
        print get_dict(i), 'with hyper-parameters:'
        print get_dict(j)
        print '-----------------------'

Round  1  Holdout CV----------------------
round  1  gridsearch cv----------------
get_binary_validation: F1(Macro) - 46.0431654676 compared with 0
threshold from the last round: 0

get_binary_validation: F1(Macro) - 46.0431654676 compared with 1
threshold from the last round: 0

get_binary_validation: F1(Macro) - 20.4475524476 compared with 2
threshold from the last round: 0

get_binary_validation: F1(Macro) - 32.7841624773 compared with 3
threshold from the last round: 0

get_binary_validation: F1(Macro) - 39.2380865687 compared with 4
threshold from the last round: 0

get_binary_validation: F1(Macro) - 56.0511839273 compared with 5
threshold from the last round: 0

get_binary_validation: F1(Macro) - 56.8246716162 compared with 6
threshold from the last round: 5

get_binary_validation: F1(Macro) - 53.6777091522 compared with 7
threshold from the last round: 6

get_binary_validation: F1(Macro) - 48.3483483483 compared with 8
threshold from the last round: 6

get_binary_validation: F1(