# 算法代码使用示例

## 以下是ABRS-SVM算法的使用说明，其他算法文件的流程与此相似

### 1. 载入库，包括三部分：代码处理需要的基本库；数据处理（交叉验证）用到的库；算法及性能评估用到的库。

In [37]:
# Basic models
import traceback
import numpy as np
import pandas as pd

# Data operation models
from sklearn.model_selection import StratifiedKFold
from sklearn import cross_validation
from sklearn.model_selection import ParameterGrid

# Classifier models
from sklearn.svm import SVC

# Evaluation models
import sklearn.metrics as sm

### 2.配置参数
#### 其中，origin_pars是一个字典，记录了每种使用参数的名称，及其取值范围。
#### 通过scikit-learn库自带的方法ParameterGrid，能生成origin_pars中各参数在取值范围内的所有组合形式，存放在pars内
#### data_name是数据集名称，k_holdout是外层交叉验证的折数，k_cv是内层交叉验证的折数，pars是参数组

In [38]:
# Configurations for the model
def run_Config():
    """Return the experiment configuration.

    Returns:
        A 4-tuple ``(data_name, k_holdout, k_cv, pars)``: the dataset file
        name, the number of outer (hold-out) folds, the number of inner
        (grid-search) folds, and the list of every hyper-parameter
        combination produced by ``ParameterGrid``.
    """
    # Scoring indicator plus the ABRS-SVM sub-sampling hyper-parameters.
    sampling_grid = {
        'indicator_name': ['macc'],
        'ts': [5],
        'tf': [5],
        'rs': [0.9],
        'rf': [0.5, 0.7, 0.9],
    }
    # Hyper-parameters handed to every SVC base learner.
    svm_grid = {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100],
        'gamma': [2, 3],
        'class_weight': [{1: 0.4, 2: 0.6}, {1: 0.5, 2: 0.5}],
    }
    origin_pars = dict(sampling_grid)
    origin_pars.update(svm_grid)
    return 'Thoracic_Surgery_onehot.xlsx', 5, 3, list(ParameterGrid(origin_pars))

### 3.载入数据集
#### run_load_data根据数据集名称，在当前文件夹下读入数据；
#### 数据文件默认存储在excel文件中，存在其他文件中的数据需要更改pd.read_excel()方法；
#### all_fea是不包含行名与列名的纯数据，一行一个样本，列为特征数，all_label是对应的类别标记；
#### get_normal_label()用于将不规则的一个序列按照元素值从小到大的顺序变成规则的序列；
#### 例如原始y = [2,5,7,3,4]，变化后为[1,4,5,2,3]（本代码中标记从1开始编号；若用于XGBoost或LGBM，则应从0开始编号）；
#### 此处使用get_normal_label()规范label的取值。

In [39]:
# Load data
def run_load_data(data_name):
    """Read the dataset from an Excel file and split it into features and labels.

    Args:
        data_name: Path handed to ``pd.read_excel``.

    Returns:
        ``(all_fea, all_label)``: every column but the last as the feature
        matrix, and the last column after it went through
        ``get_normal_label``.
    """
    table = pd.read_excel(data_name).values
    feature_columns = table[:, :-1]
    label_column = table[:, -1]
    return feature_columns, get_normal_label(label_column)

def get_normal_label(y):
    """Replace every label by its 1-based rank among the distinct labels.

    The smallest distinct value becomes 1, the next one 2, and so on, e.g.
    ``[2, 5, 7, 3, 4]`` becomes ``[1, 4, 5, 2, 3]``.

    Args:
        y: Label vector. When it is a NumPy array it is overwritten in place
            (keeping its dtype) and returned; any other sequence is left
            untouched and a new integer array is returned.

    Returns:
        The array of normalised labels.
    """
    labels = np.asarray(y)
    # The rank of each element is computed from the untouched input in one
    # pass. Rewriting y value by value is unsafe: a freshly written code can
    # equal an original label that is still waiting to be processed (labels
    # {0, 1} would all end up as 2).
    _, inverse = np.unique(labels, return_inverse=True)
    # i is OK for XGBoost or LGBM, but it must be 'i+1' for the rest methods
    codes = inverse.reshape(labels.shape) + 1
    if isinstance(y, np.ndarray):
        y[...] = codes
        return y
    return codes

### 4.内外两层交叉验证
#### 外层交叉验证名为HoldoutCV，内层名为GridSearchCV
#### HoldoutCV将当前数据集按k_holdout折划分，每一轮内分为learn数据集与test数据集，前者作为数据集带入GridSearchCV训练最优超参数，后者留出来作为测试数据；
#### GridSearchCV将learn数据集进一步划分为train与validation样本，每一轮使用train样本训练模型，再用validation样本测试模型性能；
#### GridSearchCV只记录下表现最佳的模型（即，模型的超参数组合）；
#### pars里的每组参数都需要在GridSearchCV内验证得到结果，但只有具有最佳超参数组合的模型能带入后续测试环节；
#### run_ABRSSVM()是算法ABRS-SVM的实现方法。

In [40]:
# Run both HoldoutCV and GridSearchCV
def run_doubleCV(all_fea, all_label, k_holdout, k_cv, pars):
    """Run nested cross-validation: an outer hold-out CV around a grid search.

    For every outer fold the learning part is split again into ``k_cv``
    stratified folds; each hyper-parameter combination is scored on them and
    the combination with the best mean validation score is retrained on the
    whole learning part and evaluated on the held-out part.

    Args:
        all_fea: Feature matrix, one sample per row.
        all_label: Label vector aligned with ``all_fea``.
        k_holdout: Number of outer (hold-out) folds.
        k_cv: Number of inner (grid-search) folds.
        pars: Iterable of hyper-parameter dicts; each must carry the key
            ``'indicator_name'`` next to what ``run_ABRSSVM`` reads.

    Returns:
        ``(res_list, opt_pars_list)``: per outer fold, the dict returned by
        ``run_evaluation`` and the selected hyper-parameter dict.

    Raises:
        ValueError: If ``pars`` yields no combination at all.
    """
    res_list = []
    opt_pars_list = []
    holdoutcv = StratifiedKFold(n_splits=k_holdout, shuffle=True)
    for i_learn, i_test in holdoutcv.split(all_fea, all_label):
        # Obtain current learning and held-out data
        learn_fea, test_fea = all_fea[i_learn], all_fea[i_test]
        learn_label, test_label = all_label[i_learn], all_label[i_test]

        # Grid search. The first candidate is always accepted so that a best
        # set exists even when every candidate scores 0; later candidates
        # must be strictly better, so ties keep the earlier one.
        best_pars = None
        best_score = None
        for i_pars in pars:
            pars_score = []
            gridcv = StratifiedKFold(n_splits=k_cv, shuffle=True)
            for i_train, i_valid in gridcv.split(learn_fea, learn_label):
                # Obtain current training and validation data
                train_fea, valid_fea = learn_fea[i_train], learn_fea[i_valid]
                train_label, valid_label = learn_label[i_train], learn_label[i_valid]
                # i_pars = {'par_name1': par1, ..., 'par_nameN': parN}
                valid_pre, pars_new = run_ABRSSVM(i_pars, train_fea, train_label, valid_fea)
                grid_score = run_validation(valid_pre, valid_label, i_pars['indicator_name'])
                pars_score.append(grid_score)
            mean_score = np.mean(pars_score)
            if best_score is None or mean_score > best_score:
                best_pars = pars_new
                best_score = mean_score

        if best_pars is None:
            raise ValueError("pars must contain at least one hyper-parameter combination")

        # Hold-out testing with the selected hyper-parameters (a dict too)
        test_pre, _ = run_ABRSSVM(best_pars, learn_fea, learn_label, test_fea)

        # Evaluate the prediction and save the results
        res_list.append(run_evaluation(test_pre, test_label))
        opt_pars_list.append(best_pars)

    return res_list, opt_pars_list

### 5. ABRS-SVM算法实现
#### run_ABRSSVM()是算法ABRS-SVM的实现方法。
#### 输入参数包括：参数组合p，数据矩阵X（用于训练），X的类别标记向量y，数据矩阵Z（用于测试或验证）
#### 该方法首先在原数据矩阵上分开正负类样本，接着在特征与样本层面都进行随机下采样，由每个下采样样本训练基分类器（此代码中是SVM），最后由基分类器投票预测Z里面的样本类别标记。

In [41]:
# Run ABRS-SVM
def run_ABRSSVM(p, X, y, Z):
    """Train an ABRS-SVM ensemble on (X, y) and predict the labels of Z.

    The training data is split into positive (minority) and negative
    (majority) samples. Each base SVC is trained on all positive samples plus
    one random subset of the negative samples, restricted to one random
    subset of the features. The predictions of all base learners on Z are
    combined by ``get_vote``.

    Args:
        p: Hyper-parameter dict with the keys ``'ts'``, ``'tf'``, ``'rf'``,
            ``'C'``, ``'kernel'``, ``'gamma'``, ``'class_weight'`` and
            optionally ``'rs'``. Without ``'rs'`` the negative sampling ratio
            defaults to (#positive / #negative).
        X: Training feature matrix, one sample per row.
        y: Training labels aligned with X.
        Z: Feature matrix to predict, with the same columns as X.

    Returns:
        ``(zpre, p)``: the voted predictions for Z and the unchanged ``p``.

    Raises:
        Exception: If ``'rs'`` is missing and there is no positive sample.
    """
    # Split positive and negative data
    X_pos, y_pos, X_neg, y_neg = get_imbalance_split(X, y)
    n_pos_s = X_pos.shape[0]
    n_neg_s, n_neg_f = X_neg.shape

    # Initialize hyper-parameters
    ts, tf, rf = p['ts'], p['tf'], p['rf']
    if 'rs' in p:
        rs = p['rs']
    else:
        if n_pos_s == 0:
            raise Exception("No positive samples!")
        rs = n_pos_s / float(n_neg_s)

    # Generate sub-sampling slices for both samples and features
    slice_s, slice_f = get_slice(ts, tf, n_neg_s, n_neg_f, rs, rf)

    # Train base learners. Every learner is stored together with the feature
    # subset it was fitted on, because prediction needs the same columns.
    learners = []
    for i in slice_s:
        for j in slice_f:
            # Generate the currently used dataset
            X_now = np.concatenate((X_pos[:, j], X_neg[i, :][:, j]), axis=0)
            y_now = np.concatenate((y_pos, y_neg[i]), axis=0)
            i_shuffle = np.random.permutation(len(y_now))
            X_now = X_now[i_shuffle, :]
            y_now = y_now[i_shuffle]
            # Train the classifier
            clf = SVC(C=p['C'], kernel=p['kernel'], gamma=p['gamma'],
                      class_weight=p['class_weight'])
            clf.fit(X_now, y_now)
            learners.append((clf, j))

    # Testing: each learner only sees the columns it was trained on
    vote_mat = [clf.predict(Z[:, j]) for clf, j in learners]

    # Voting
    zpre = get_vote(vote_mat)

    return zpre, p

### 6.不平衡数据集判断及划分
#### 一些不平衡方法由于涉及采样与加权，当正类数目大于负类数目时可能出错，因此需要一个方法来纠正样本数目
#### get_imbalance_split()方法用于纠正两类样本数目，总是把样本数更多的一方设为负类（数目相等时，取值较小的类别为负类）；当类别数不等于2时报错。

In [42]:
def get_imbalance_split(X, y):
    """Split a two-class dataset into minority (positive) and majority (negative) parts.

    Args:
        X: Feature matrix, one sample per row.
        y: Label array aligned with the rows of X.

    Returns:
        ``(X_pos, y_pos, X_neg, y_neg)``, where the negative class is the one
        with more samples. On a tie the smaller label value is the negative
        class.

    Raises:
        Exception: If y does not contain exactly two distinct labels.
    """
    classes = np.unique(y)
    if len(classes) != 2:
        raise Exception("Not a binary-class!")

    low, high = classes
    # The more frequent label is negative; a tie goes to the smaller label.
    if sum(y == low) >= sum(y == high):
        neg_label, pos_label = low, high
    else:
        neg_label, pos_label = high, low

    pos_mask = y == pos_label
    neg_mask = y == neg_label
    return X[pos_mask, :], y[pos_mask], X[neg_mask, :], y[neg_mask]

### 7. 切片（提取采样的下标）
#### get_slice()是run_ABRSSVM()方法中的一步，用于下采样样本及特征。
#### ts是采集的样本的次数，tf是采集的特征的次数。

In [43]:
def get_slice(ts, tf, ns, nf, rs, rf):
    """Draw random index subsets for under-sampling samples and features.

    Args:
        ts, tf: Batch counts for sub-samples and sub-features.
        ns, nf: Number of original (negative) samples and features.
        rs, rf: Ratio of samples and features kept in each subset.

    Returns:
        ``(slice_s, slice_f)``: two lists of index lists. Each entry of
        ``slice_s`` holds ``int(ns * rs)`` distinct sample indices and each
        entry of ``slice_f`` holds ``int(nf * rf)`` distinct feature indices.

    NOTE(review): both lists receive ``ts * tf`` entries (one per iteration
    of the nested loop), not ``ts`` and ``tf`` entries respectively. A caller
    that pairs every sample slice with every feature slice therefore gets
    ``(ts * tf) ** 2`` combinations. Kept as is to preserve results; confirm
    whether this is intended.
    """
    import random

    sub_s = int(ns * rs)
    sub_f = int(nf * rf)
    slice_s = []
    slice_f = []
    for _ in range(ts):
        for _ in range(tf):
            # random.shuffle needs a mutable sequence, so the index range is
            # materialised as a list first.
            now_s = list(range(ns))
            random.shuffle(now_s)
            slice_s.append(now_s[:sub_s])
            now_f = list(range(nf))
            random.shuffle(now_f)
            slice_f.append(now_f[:sub_f])
    return slice_s, slice_f

### 8.投票
#### 当ABRS-SVM进入测试环节，不是像bagging那样简单aggregate样本，而是使用投票机制。
#### 每个基分类器对当前测试样本给出一个预测，得票数最多的预测值将作为最终预测结果。

In [44]:
def get_vote(V):
    """Combine the predictions of several classifiers by majority vote.

    Args:
        V: Sequence of prediction vectors, one per classifier, all covering
            the same samples in the same order.

    Returns:
        A NumPy array with the most frequent prediction for every sample.
        On a tie the value predicted by the earliest classifier among the
        tied values wins.

    Raises:
        ValueError: If V contains no prediction vector.
    """
    if len(V) == 0:
        raise ValueError("get_vote needs at least one vector of predictions")
    # zip(*V) regroups the votes per sample; max() returns the first ballot
    # reaching the highest count, which gives the tie-breaking rule above.
    zpre = [max(ballots, key=ballots.count) for ballots in zip(*V)]
    return np.array(zpre)

### 9. 验证
#### run_validation()方法用于验证预测类别标记向量zpre和真实标记向量z之间的差异；
#### s_name是用于选择使用哪种验证指标的参数（可取'macc'、'gm'、'tpr'，其他取值则使用准确率）。

In [45]:
# Obtain the score
def run_validation(zpre, z, s_name):
    """Score predicted labels against the true labels with the chosen indicator.

    Args:
        zpre: Predicted label array.
        z: True label array; its largest value is treated as the positive
            class and its smallest value as the negative class.
        s_name: Indicator name, case-insensitive: ``'macc'`` (mean of TPR and
            TNR), ``'gm'`` (square root of TPR * TNR) or ``'tpr'``. Any other
            name selects the plain fraction of correct predictions.

    Returns:
        The score. The three named indicators are percentages (0-100); the
        fallback is a fraction (0-1).
    """
    pos_idx = np.nonzero(z == max(z))
    neg_idx = np.nonzero(z == min(z))
    # Per-class hit rate, expressed through the Hamming loss on that class.
    tpr = 100 * (1 - sm.hamming_loss(z[pos_idx], zpre[pos_idx]))
    tnr = 100 * (1 - sm.hamming_loss(z[neg_idx], zpre[neg_idx]))

    indicator = s_name.lower()
    if indicator == 'macc':
        return 0.5 * (tpr + tnr)
    if indicator == 'gm':
        return np.sqrt(tpr * tnr)
    if indicator == 'tpr':
        return tpr
    # Unknown indicator: fall back to the share of matching labels.
    hits = sum(1 for guess, truth in zip(zpre, z) if guess == truth)
    return hits / float(len(z))

### 10.性能评估
#### 采用了以下评估指标：True Positive Rate (TPR), True Negative Rate (TNR),
#### MAcc = 0.5*(TPR+TNR), GM = sqrt(TPR*TNR),
#### F1(macro & micro)
#### Acc

In [46]:
# Evaluate the performance
def run_evaluation(p, y):
    """Compute the evaluation metrics of predictions p against true labels y.

    Args:
        p: Predicted label array.
        y: True label array; its largest value is treated as the positive
            class and its smallest value as the negative class.

    Returns:
        A dict with the keys ``'TPR'``, ``'TNR'``, ``'MAcc'``, ``'GM'``,
        ``'F1(Macro)'``, ``'F1(Micro)'`` and ``'Acc'``, each a percentage.
    """
    pos_idx = np.nonzero(y == max(y))
    neg_idx = np.nonzero(y == min(y))
    # Per-class hit rate, expressed through the Hamming loss on that class.
    tpr = 100 * (1 - sm.hamming_loss(y[pos_idx], p[pos_idx]))
    tnr = 100 * (1 - sm.hamming_loss(y[neg_idx], p[neg_idx]))
    return {
        'TPR': tpr,
        'TNR': tnr,
        'MAcc': np.mean([tpr, tnr]),
        'GM': np.sqrt(tpr * tnr),
        'F1(Macro)': 100 * sm.f1_score(y, p, average='macro'),
        'F1(Micro)': 100 * sm.f1_score(y, p, average='micro'),
        'Acc': 100 * sm.accuracy_score(y, p),
    }

### 11. 用于显示最佳分类结果背后对应的超参数组合，以及最优分类结果是多少。

In [47]:
def get_dict(d):
    """Render every entry of a dict as a ``'<key>_<value>'`` string.

    Args:
        d: Dict with string keys.

    Returns:
        A list with one string per key, in the dict's iteration order.
    """
    return [name + '_' + str(d[name]) for name in d]

### 12. 调用01-11方法的主方法。
#### 首先调用配置参数的方法。
#### 接着载入数据集。
#### 之后调用交叉验证方法，并得到结果。
#### 最后打印字典。

In [None]:
if __name__ == '__main__':
    # Configure, load the data, run the nested cross-validation and report
    # the result of every outer fold with the hyper-parameters selected for it.
    data_name, k_holdout, k_cv, pars = run_Config()
    all_fea, all_label = run_load_data(data_name)
    res_list, opt_pars_list = run_doubleCV(all_fea, all_label, k_holdout, k_cv, pars)
    for fold_res, fold_pars in zip(res_list, opt_pars_list):
        # Single-argument print calls produce the same text on Python 2 and 3.
        print('%s with hyper-parameters:' % (get_dict(fold_res),))
        print(get_dict(fold_pars))
        print('-----------------------')