### Multi-class Threshold-based Rule Discovery

In [7]:
# Basic models
import traceback
import numpy as np
import pandas as pd
import scipy.stats as scs

# Data operation models
from sklearn.model_selection import StratifiedKFold
from sklearn import cross_validation
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import LabelEncoder

# Evaluation models
import sklearn.metrics as sm



In [8]:
# Configurations for the model
def run_Config():
    data_name = 'Thoracic_test.xlsx'
    k_holdout = 5
    k_cv = 5
    origin_pars = {
        'indicator_name':['Acc_Macro'],
        'indicator':['Acc_Macro'],
        'chi_name': ['p','chi2'],
        'chi_value':[0.05]
    }
    pars = list(ParameterGrid(origin_pars))
    return data_name, k_holdout, k_cv, pars

In [9]:
# Load data
def run_load_data(data_name):
    all = pd.read_excel(data_name)
    label = all.iloc[:, -1]    # get labels
    data = all.iloc[:, :-1] # get data
    # some preprocessing...
    return data, label

In [10]:
# Run both HoldoutCV and GridSearchCV
def run_doubleCV(all_fea, all_label, k_holdout, k_cv, pars):
    # HoldoutCV
    i_t = 0
    res_list = []
    opt_pars_list = []
    all_label_num = get_SimpleNumCode(all_label)
    all_fea_num = np.ones(all_label_num.shape)
    
    holdoutcv = StratifiedKFold(n_splits = k_holdout, shuffle = True)
    for i_learn, i_test in holdoutcv.split(all_fea_num, all_label_num):
        # Times
        i_t = i_t + 1
        print 'Round ', str(i_t), ' Holdout CV----------------------'
        
        # Obtain current learning and heldout data
        learn_fea, test_fea = all_fea.iloc[i_learn,:], all_fea.iloc[i_test,:]
        learn_label, test_label = all_label.iloc[i_learn], all_label.iloc[i_test]
        
        # GridSearchCV
        j_t = 0
        optdata = {'score':0}
        for i_pars in pars:
            # times
            j_t = j_t + 1
            print 'round ', str(j_t), ' gridsearch cv----------------'
            pars_score = []
            learn_label_num = get_SimpleNumCode(learn_label)
            learn_fea_num = np.ones(learn_label_num.shape)
            
            gridcv = StratifiedKFold(n_splits = k_cv, shuffle = True)
            for i_train, i_valid in gridcv.split(learn_fea_num, learn_label_num):
                # obtain current training and validation data
                train_fea, valid_fea = learn_fea.iloc[i_train,:], learn_fea.iloc[i_valid,:]
                train_label, valid_label = learn_label.iloc[i_train], learn_label.iloc[i_valid]
                
                # learn the model
                # i_pars = {'par_name1':par1,'par_name2':par2,...,'par_nameN':parN}
                valid_pre, pars_new = run_RuleDiscovery(i_pars, train_fea, train_label, valid_fea)
                eval_dict = run_evaluation(valid_pre, valid_label)
                grid_score = eval_dict[i_pars['indicator_name']]
                pars_score.extend([grid_score])
            if np.mean(pars_score) > optdata['score']:
                optdata['pars'] = pars_new
                optdata['score'] = np.mean(pars_score)
        
        # Holdout testing
        # best_pars is a dict too
        best_pars = optdata['pars']
        test_pre, _ = run_RuleDiscovery(best_pars, learn_fea, learn_label, test_fea)
        
        # Evaluate the prediction
        res_now = run_evaluation(test_pre, test_label)
        
        # Save results
        res_list.append(res_now)
        opt_pars_list.append(best_pars)
        
    return res_list, opt_pars_list

In [11]:
def get_SimpleNumCode(y):
    clf = LabelEncoder()
    clf.fit(y)
    return clf.transform(y)

In [12]:
# Run Rule Discovery
def run_RuleDiscovery(p, X, y, Z):
    # train and validate rules
    # v-values, f-features, c-classes, t-thresholds
    # rule = [(v,f,c),(v,f,c),...(v,f,c)]
    # threshold = [(t,c),(t,c),...,(t,c)]
    rule = get_rules(X, y, p)
    threshold = get_validation(X, y, rule, p)
    p['threshold'] = threshold
    
    # test rules
    zpre = get_rule_predict(Z, rule, threshold)
    
    return zpre, p

In [13]:
# Train rules
def get_rules(data, label, par):
    fea_name = data.columns.values # get names of features
    rule = []
    for i_f in fea_name:
        square_now = get_square(data[i_f], label)
        chi_tag_now = get_chi_test(square_now.values, par)
        if chi_tag_now == 0: # the current feature would not be considered(selected)
            continue 
        else:
            rule_now = get_fea_rules(square_now, i_f)
        rule.extend(rule_now)
    return rule # rule is a list with structure like: rule = [iterm1, iterm2, ..., itermN], where itermi = (value, feature, class) 

def get_square(fea, label):
    # this function is used to generate the contingency table for fea and class_name
    value_name = set(fea)  # get names of current features
    class_name = set(label) # get names of classes
    F = []
    for i_c in class_name:
        f = []
        for i_v in value_name:
            v_index = get_index(fea, i_v) # find index of current value
            c_index = get_index(label, i_c) # find index of current class
            f_v2c = len(set(v_index).intersection(c_index)) # frequence of each value-class pair  
            f.extend([f_v2c])
        F.extend([f])
    F = pd.DataFrame(F, columns = value_name) # transform double list into dataframe
    F = F.rename(lambda x:list(class_name)[x])
    return F

def get_index(lst_now, name_now):
    return [i for i,j in zip(range(len(lst_now)), lst_now) if j == name_now]

def get_chi_test(square, par):
    # this function is used to generate a tag according to threshold
    threshold_value = par['chi_value']
    threshold_name = par['chi_name']
    chi2, p, _, _ = scs.chi2_contingency(square)
    if threshold_name is 'chi2':
        chi_tag = 1 if threshold_value < chi2 else 0
    elif threshold_name is 'p':
        chi_tag = 1 if threshold_value < p else 0 
    return chi_tag

def get_fea_rules(square, fea_name):
    # this function is used to generate rules for each class from the current features 
    wsquare = get_weighted_square(square)
    best_class_name = wsquare.apply(np.argmax, axis = 0)
    rule_fea = [] # rules generated in current feature
    for i, i_v in enumerate(wsquare.columns):
        rule_fea.extend([(i_v, fea_name, best_class_name[i])]) # (value, feature, class)  
    return rule_fea

def get_weighted_square(Z):
    r = Z.apply(np.sum, axis = 1) # summation of columns
    for i in xrange(len(r)):
        Z.ix[i,:] = Z.ix[i,:].apply(lambda x:x/float(r[i])) # ratio of each element
    return Z


In [14]:
# Valide rules
def get_validation(data, label, rule, par):
    # this function is used to determine the optimal threshold for each rule of the class
    threshold = {}
    class_rule = {}
    class_name = set(label)
    for i_c in class_name:
        class_rule[i_c] = [(j_v, j_f, j_c) for j_v, j_f, j_c in rule if j_c == i_c] # rules of the current class
        threshold[i_c] = range(len(set([k_f for _,k_f,_ in class_rule[i_c]]))) # number of features in the rule of the current class
    threshold_grid = list(ParameterGrid(threshold))
    print threshold_grid
    print
    
    best_res = 0
    best_threshold = None
    for i_t in threshold_grid:
        pre_now = {}
        for c in class_name: # each class makes a prediction for the data
            threshold_now = i_t[c]
            pre_rule_num = get_micro_predict(data, class_rule[c])
            pre_now[c] = [c if i_r > threshold_now else None for i_r in pre_rule_num]
        pre_label = get_voted_pre(pre_now)
        result_now = run_evaluation(pre_label, label)
        print 'get_validation:', par['indicator'], '-', result_now[par['indicator']], 'compared with', threshold_now
        print 'threshold from the last round:', threshold_now
        print
        if result_now[par['indicator']] > best_res:
            best_res = result_now[par['indicator']]
            best_threshold = i_t # i_t is a dict with sturcture like i_t = {c1: threshold1, c2: threshold2, ..., cN:thresholdN}
    return best_threshold

def get_micro_predict(data, rule):
    # this function is used to calculate the number of matched rule 
    # (belonging to the same class) for each sample.
    pre_rule_num = []
    for i_data in xrange(len(data.index)):
        sample = zip(data.iloc[i_data,:], data.columns)
        match_num = get_match(sample, rule)
        pre_rule_num.extend([match_num])
    return pre_rule_num

def get_match(sample, rule):
    match_num = 0
    for i_v, i_f in sample:
        for j_v, j_f, _ in rule:
            if i_f == j_f and i_v == j_v:
                match_num = match_num + 1
    return match_num

def get_voted_pre(pdict):
    tempor_list = [[j for j in pdict[i]] for i in pdict]
    N = len(tempor_list[0])
    plist = [[n[m] for n in tempor_list] for m in xrange(N)]
    return [max(k, key = k.count) for k in plist]


In [15]:
# Test rules
def get_rule_predict(Z, rule, threshold):
    class_name = set([c for _, _, c in rule])
    zpre = []
    for i in xrange(len(Z.index)):
        Z_i = zip(Z.iloc[i,:], Z.columns)
        vote = []
        for j in class_name:
            rule_now = [(v,f,c) for v, f, c in rule if c == j]
            threshold_now = threshold[j]
            print 'Now:', get_match(Z_i, rule_now), 'compared with', threshold_now, ' | ', j
            if get_match(Z_i, rule_now) >= threshold_now:
                vote.extend([j])
        zpre.extend([max(vote, key = vote.count)])
    return zpre

In [16]:
# Evaluate the performance
def run_evaluation(p, y):
    # p and y are two list
    res_dict = {}
    acc_each = []
    class_name = set(y)
    for i in class_name:
        p_index = get_index(p, i) # find index of current pre_label
        y_index = get_index(y, i) # find index of current true_label    
        acc_p2y = len(set(p_index).intersection(y_index))/float(len(y_index)) * 100 # acc of curren class
        acc_each.extend([(acc_p2y, i)]) # (accuracy, class)     
    res_dict['Acc_Macro'] = np.mean([a for a, c in acc_each])
    res_dict['Acc_each'] = acc_each
    return res_dict

In [17]:
def get_dict(d):
    list_final = []
    for i in d:
        list_now = i + '_' + str(d[i])
        list_final.append(list_now)
    return list_final

In [18]:
if __name__ == '__main__':
    data_name, k_holdout, k_cv, pars = run_Config()
    all_fea, all_label = run_load_data(data_name)
    res_list, opt_pars_list = run_doubleCV(all_fea, all_label, k_holdout, k_cv, pars)
    print 'Each heldout cv result:'
    print '-----------------------'
    for i, j in zip(res_list,opt_pars_list):
        print get_dict(i), 'with hyper-parameters:'
        print get_dict(j)
        print '-----------------------'

Round  1  Holdout CV----------------------
round  1  gridsearch cv----------------
[{u'T': 0, u'F': 0}, {u'T': 1, u'F': 0}, {u'T': 2, u'F': 0}, {u'T': 3, u'F': 0}, {u'T': 4, u'F': 0}, {u'T': 5, u'F': 0}, {u'T': 6, u'F': 0}, {u'T': 7, u'F': 0}, {u'T': 8, u'F': 0}, {u'T': 9, u'F': 0}, {u'T': 0, u'F': 1}, {u'T': 1, u'F': 1}, {u'T': 2, u'F': 1}, {u'T': 3, u'F': 1}, {u'T': 4, u'F': 1}, {u'T': 5, u'F': 1}, {u'T': 6, u'F': 1}, {u'T': 7, u'F': 1}, {u'T': 8, u'F': 1}, {u'T': 9, u'F': 1}, {u'T': 0, u'F': 2}, {u'T': 1, u'F': 2}, {u'T': 2, u'F': 2}, {u'T': 3, u'F': 2}, {u'T': 4, u'F': 2}, {u'T': 5, u'F': 2}, {u'T': 6, u'F': 2}, {u'T': 7, u'F': 2}, {u'T': 8, u'F': 2}, {u'T': 9, u'F': 2}, {u'T': 0, u'F': 3}, {u'T': 1, u'F': 3}, {u'T': 2, u'F': 3}, {u'T': 3, u'F': 3}, {u'T': 4, u'F': 3}, {u'T': 5, u'F': 3}, {u'T': 6, u'F': 3}, {u'T': 7, u'F': 3}, {u'T': 8, u'F': 3}, {u'T': 9, u'F': 3}, {u'T': 0, u'F': 4}, {u'T': 1, u'F': 4}, {u'T': 2, u'F': 4}, {u'T': 3, u'F': 4}, {u'T': 4, u'F': 4}, {u'T': 5, u'F': 

KeyboardInterrupt: 