In [1]:
# Basic modules
import traceback
import numpy as np
import pandas as pd

# Data operation modules
from sklearn.model_selection import StratifiedKFold
from sklearn import cross_validation  
from sklearn.model_selection import ParameterGrid 

# Classifier models
from sklearn.ensemble import RandomForestClassifier

# Evaluation metrics
import sklearn.metrics as sm



In [2]:
# Configurations for the model
def run_Config():
    """Return the experiment configuration.

    Returns
    -------
    tuple
        ``(data_name, k_holdout, k_cv, pars)``: the spreadsheet file name,
        the number of outer (hold-out) folds, the number of inner
        (grid-search) folds, and the list of every hyper-parameter
        combination produced by ``ParameterGrid``.
    """
    workbook = 'Thoracic_Surgery_onehot.xlsx'
    n_outer_folds, n_inner_folds = 10, 5

    # Per-class weights tried for the random forest (keys are class labels).
    weightings = [
        {1: 0.1, 2: 0.9},
        {1: 0.2, 2: 0.8},
        {1: 0.25, 2: 0.75},
        {1: 0.4, 2: 0.6},
    ]

    search_space = dict(
        indicator_name=['macc'],
        k_smote=[1, 3, 5],        # SMOTE hyper-parameter
        r_newpoint=[1, 2, 3],     # SMOTE hyper-parameter
        n_estimators=[5, 10, 30, 50],
        max_depth=[2, 4, 6, 8],
        max_features=['log2'],
        class_weight=weightings,
    )

    combinations = list(ParameterGrid(search_space))
    return workbook, n_outer_folds, n_inner_folds, combinations

In [3]:
# Load data
def run_load_data(data_name):
    """Read the spreadsheet and split it into features and labels.

    The last column is treated as the label column and is passed through
    ``get_normal_label``; every preceding column is a feature.

    Parameters
    ----------
    data_name : str
        Path of the Excel file handed to ``pandas.read_excel``.

    Returns
    -------
    tuple
        ``(all_fea, all_label)`` as numpy arrays.
    """
    table = pd.read_excel(data_name).values
    features = table[:, :-1]
    labels = get_normal_label(table[:, -1])
    return features, labels

def get_normal_label(y):
    """Map arbitrary class labels onto consecutive integers starting at 1.

    The smallest distinct value becomes 1, the next one 2, and so on.

    The mapping is computed in a single pass from the *original* values.
    Relabelling in place one class at a time is unsafe: with labels
    ``{0, 1}`` the zeros would first become 1 and then be rewritten to 2
    together with the genuine ones, collapsing both classes.

    Parameters
    ----------
    y : array-like
        Label vector. It is not modified.

    Returns
    -------
    numpy.ndarray
        Integer array with the same shape as ``y`` holding values in
        ``1 .. n_classes``.
    """
    y = np.asarray(y)
    # `codes` holds, for every element, the index of its value in the sorted
    # array of distinct labels, i.e. the 0-based class id.
    _, codes = np.unique(y, return_inverse=True)
    return codes.reshape(y.shape) + 1

In [4]:
# Run both HoldoutCV and GridSearchCV
def run_doubleCV(all_fea, all_label, k_holdout, k_cv, pars):
    """Nested cross-validation: outer hold-out folds, inner grid search.

    For every outer fold, each hyper-parameter combination in ``pars`` is
    scored by an inner stratified k-fold on the learning part; the
    combination with the highest mean score is then used to fit on the
    whole learning part and predict the held-out part.

    Parameters
    ----------
    all_fea, all_label : numpy.ndarray
        Feature matrix and label vector.
    k_holdout : int
        Number of outer folds.
    k_cv : int
        Number of inner folds.
    pars : list of dict
        Hyper-parameter combinations. The dicts are not modified.

    Returns
    -------
    tuple
        ``(res_list, opt_pars_list)``: one ``run_evaluation`` result dict and
        one selected hyper-parameter dict per outer fold.

    Raises
    ------
    ValueError
        If ``pars`` is empty.
    """
    if len(pars) == 0:
        raise ValueError("pars must contain at least one hyper-parameter combination")

    res_list = []
    opt_pars_list = []
    holdoutcv = StratifiedKFold(n_splits=k_holdout, shuffle=True)
    for i_learn, i_test in holdoutcv.split(all_fea, all_label):
        # Obtain current learning and held-out data
        learn_fea, test_fea = all_fea[i_learn], all_fea[i_test]
        learn_label, test_label = all_label[i_learn], all_label[i_test]

        # Grid search. The best score starts undefined rather than at 0, so
        # a combination is always selected even when every score is 0.
        best_score = None
        best_pars = None
        for i_pars in pars:
            # Work on a private copy: run_smote_model may rewrite k_smote /
            # r_newpoint, and that must not leak into the shared grid (it
            # would shrink the search space of later outer folds and alter
            # parameters that were already reported).
            candidate = dict(i_pars)
            pars_new = candidate
            pars_score = []
            gridcv = StratifiedKFold(n_splits=k_cv, shuffle=True)
            for i_train, i_valid in gridcv.split(learn_fea, learn_label):
                # Obtain current training and validation data
                train_fea, valid_fea = learn_fea[i_train], learn_fea[i_valid]
                train_label, valid_label = learn_label[i_train], learn_label[i_valid]
                # Learn the model and score it on the validation part
                valid_pre, pars_new = run_smote_model(candidate, train_fea, train_label, valid_fea, 0)
                grid_score = run_validation(valid_pre, valid_label, candidate['indicator_name'])
                pars_score.append(grid_score)
            mean_score = np.mean(pars_score)
            if best_score is None or mean_score > best_score:
                best_score = mean_score
                # Snapshot, so later rounds cannot change what is reported.
                best_pars = dict(pars_new)

        # Hold-out testing with the selected hyper-parameters
        test_pre, _ = run_smote_model(best_pars, learn_fea, learn_label, test_fea, 1)

        # Evaluate the prediction and save the results
        res_list.append(run_evaluation(test_pre, test_label))
        opt_pars_list.append(best_pars)

    return res_list, opt_pars_list

In [5]:
# Fit the model and make the prediction
def run_smote_model(p, X, y, Z, i_tag):
    """Fit a random forest on ``(X, y)`` and predict the labels of ``Z``.

    Parameters
    ----------
    p : dict
        Hyper-parameters; reads ``k_smote``, ``r_newpoint``,
        ``n_estimators``, ``max_depth``, ``max_features`` and
        ``class_weight``. The dict passed in is not modified.
    X, y : numpy.ndarray
        Training features and labels.
    Z : numpy.ndarray
        Features to predict.
    i_tag : int
        ``0`` oversamples the training data with ``run_SMOTE`` before
        fitting; any other value fits on ``(X, y)`` unchanged.

    Returns
    -------
    tuple
        ``(zpre, p_used)``: the predictions for ``Z`` and a copy of ``p`` in
        which ``k_smote`` / ``r_newpoint`` hold the values actually used by
        ``run_SMOTE`` (only updated when ``i_tag == 0``).
    """
    # Copy first: the caller's dict is typically one entry of a shared
    # parameter grid, and updating it in place would silently change the grid.
    p = dict(p)

    if i_tag == 0:
        # Run SMOTE to get the new training data
        X_new, y_new, k_new, r_new = run_SMOTE(X, y, p['k_smote'], p['r_newpoint'])
        # Update k_smote and r_newpoint with the constrained values
        p['k_smote'] = k_new
        p['r_newpoint'] = r_new
    else:
        # NOTE(review): on this path the model is trained WITHOUT SMOTE even
        # though the hyper-parameters were tuned with it - confirm that this
        # is intended for the hold-out fit.
        X_new, y_new = X, y

    # Run the basic model
    clf = RandomForestClassifier(n_estimators=p['n_estimators'],
                                 max_depth=p['max_depth'],
                                 max_features=p['max_features'],
                                 class_weight=p['class_weight'])
    clf.fit(X_new, y_new)
    zpre = clf.predict(Z)
    return zpre, p

In [6]:
"""# Oversampling by SMOTE
def run_SMOTE(X, y, k, r):
    return X, y, k, r"""

'# Oversampling by SMOTE\ndef run_SMOTE(X, y, k, r):\n    return X, y, k, r'

In [7]:
# Oversampling by SMOTE
from sklearn.neighbors import NearestNeighbors as kNN
def run_SMOTE(X, y, k, r):
    """Oversample the minority class of a binary data set (SMOTE).

    For every minority ("positive") sample, ``r`` synthetic samples are
    generated towards each of its ``k`` nearest minority neighbours via
    ``get_new_pos``. The result is the original data plus the synthetic
    samples, randomly shuffled.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Label vector with exactly two distinct values.
    k : int
        Requested number of nearest neighbours per minority sample.
    r : int
        Requested number of synthetic samples per (sample, neighbour) pair.

    Returns
    -------
    tuple
        ``(X_new, y_new, k, r)`` where ``k`` and ``r`` are the values
        actually used after being constrained.

    Raises
    ------
    Exception
        If ``y`` does not contain exactly two classes.
    """
    # Obtain positive (less frequent) and negative (more frequent) data.
    # np.unique guarantees that each of the two classes has >= 1 sample.
    v = np.unique(y)
    if len(v) != 2:
        raise Exception("Not a binary-class!")
    if np.count_nonzero(y == v[0]) >= np.count_nonzero(y == v[1]):
        neg_value, pos_value = v[0], v[1]
    else:
        neg_value, pos_value = v[1], v[0]
    pos_mask = y == pos_value
    neg_mask = y == neg_value
    X_pos, y_pos = X[pos_mask, :], y[pos_mask]
    X_neg, y_neg = X[neg_mask, :], y[neg_mask]
    n_pos = X_pos.shape[0]
    n_neg = X_neg.shape[0]

    # Constrain the hyper-parameters to their suitable ranges.
    # A sample has at most n_pos - 1 neighbours other than itself.
    k = min(k, n_pos - 1)
    # Shrink r first, then k, while the oversampled positives would reach or
    # exceed the negatives. This guard also runs after k has been clamped.
    while k > 1:
        while r > 1 and n_pos + n_pos * k * r >= n_neg:
            r = r - 1
        if n_pos + n_pos * k * r >= n_neg:
            k = k - 1
        else:
            break

    # Generate new positive samples from the k nearest neighbours
    new_pos_mat = []
    if k >= 1 and r >= 1:
        kNN_model = kNN(n_neighbors=k)
        kNN_model.fit(X_pos)
        # Querying without X returns, for each fitted point, the indices of
        # its k neighbours with the point itself excluded. These indices are
        # used directly as row numbers of X_pos (sorting them would turn
        # them into rank positions and pick the wrong rows).
        i_neighbor = kNN_model.kneighbors(return_distance=False)
        for i in range(n_pos):
            now_sample = X_pos[i, :]
            for j in range(k):
                new_pos_mat.extend(get_new_pos(now_sample, X_pos[i_neighbor[i, j], :], r))

    # Combine the newly generated samples with the original data
    if len(new_pos_mat) > 0:
        new_pos_mat = np.asarray(new_pos_mat)
        new_pos_label = np.full(len(new_pos_mat), y_pos[0], dtype=y_pos.dtype)
        X_new = np.concatenate((X_neg, X_pos, new_pos_mat), axis=0)
        y_new = np.concatenate((y_neg, y_pos, new_pos_label), axis=0)
    else:
        # Nothing to add (e.g. a single positive sample has no neighbour).
        X_new = np.concatenate((X_neg, X_pos), axis=0)
        y_new = np.concatenate((y_neg, y_pos), axis=0)
    i_shuffle = np.random.permutation(len(y_new))
    X_new = X_new[i_shuffle, :]
    y_new = y_new[i_shuffle]

    return X_new, y_new, k, r

def get_new_pos(point, neighbor, r):
    """Generate ``r`` synthetic samples between ``point`` and ``neighbor``.

    Each synthetic sample is ``point + gap * (neighbor - point)`` with an
    independent random ``gap`` in ``[0, 1)`` per feature, so every
    coordinate lies between the corresponding coordinates of the two
    inputs. (Using ``point - neighbor`` as the direction would instead
    push samples away from the neighbour.)

    Parameters
    ----------
    point : numpy.ndarray
        1-D feature vector of the sample being oversampled.
    neighbor : numpy.ndarray
        1-D feature vector of one of its neighbours.
    r : int
        Number of synthetic samples to create.

    Returns
    -------
    list of numpy.ndarray
        ``r`` new 1-D feature vectors (an empty list when ``r <= 0``).
    """
    new_points = []
    for _ in range(r):
        gap = np.random.rand(point.shape[0])
        new_points.append(point + (neighbor - point) * gap)
    return new_points


In [8]:
# Obtain the score
def run_validation(zpre, z, s_name):
    """Score predictions with the requested indicator, in percent (0-100).

    The largest label in ``z`` is treated as the positive class and the
    smallest as the negative class.

    Parameters
    ----------
    zpre : array-like
        Predicted labels.
    z : array-like
        True labels.
    s_name : str
        Indicator name, case-insensitive: ``'macc'`` (mean of TPR and TNR),
        ``'gm'`` (geometric mean of TPR and TNR) or ``'tpr'``. Any other
        name falls back to the overall accuracy.

    Returns
    -------
    float
        The score as a percentage. The accuracy fallback is scaled to
        percent as well, so all indicators share one scale.
    """
    z = np.asarray(z)
    zpre = np.asarray(zpre)

    def class_recall(label):
        # Share of the samples whose true label is `label` that were
        # predicted correctly; numerically the same as 1 - hamming_loss
        # restricted to those samples.
        picked = z == label
        return (1 - np.mean(z[picked] != zpre[picked])) * 100

    tpr = class_recall(z.max())
    tnr = class_recall(z.min())

    name = s_name.lower()
    if name == 'macc':
        s = 0.5 * (tpr + tnr)
    elif name == 'gm':
        s = np.sqrt(tpr * tnr)
    elif name == 'tpr':
        s = tpr
    else:
        # Unknown indicator: fall back to the overall accuracy.
        s = np.mean(z == zpre) * 100
    return s

In [9]:
# Evaluate the performance
def run_evaluation(p, y):
    """Compute the evaluation metrics of predictions ``p`` against ``y``.

    The largest label in ``y`` is treated as the positive class and the
    smallest as the negative class. Every metric is a percentage.

    Returns
    -------
    dict
        Keys ``'TPR'``, ``'TNR'``, ``'MAcc'``, ``'GM'``, ``'F1(Macro)'``,
        ``'F1(Micro)'`` and ``'Acc'``.
    """
    def recall_pct(idx):
        # Share of the selected samples that were predicted correctly.
        return (1 - sm.hamming_loss(y[idx], p[idx])) * 100

    pos_idx = np.nonzero(y == max(y))
    neg_idx = np.nonzero(y == min(y))
    tpr = recall_pct(pos_idx)
    tnr = recall_pct(neg_idx)

    return {
        'TPR': tpr,
        'TNR': tnr,
        'MAcc': np.mean([tpr, tnr]),
        'GM': np.sqrt(tpr * tnr),
        'F1(Macro)': sm.f1_score(y, p, average='macro') * 100,
        'F1(Micro)': sm.f1_score(y, p, average='micro') * 100,
        'Acc': sm.accuracy_score(y, p) * 100,
    }

In [10]:
def get_dict(d):
    """Render every entry of ``d`` as a ``'<key>_<value>'`` string.

    Keys must be strings; values are converted with ``str``. The result
    follows the iteration order of ``d``.
    """
    return [key + '_' + str(value) for key, value in d.items()]

In [None]:
if __name__ == '__main__':
    # Run the whole experiment: configure, load, nested CV, then report.
    data_name, k_holdout, k_cv, pars = run_Config()
    all_fea, all_label = run_load_data(data_name)
    res_list, opt_pars_list = run_doubleCV(all_fea, all_label, k_holdout, k_cv, pars)
    # One report per held-out fold: its metrics, then the hyper-parameters
    # selected for it. Each print call takes a single argument so the output
    # is the same under both Python 2 and Python 3.
    for fold_res, fold_pars in zip(res_list, opt_pars_list):
        print(str(get_dict(fold_res)) + ' with hyper-parameters:')
        print(get_dict(fold_pars))
        print('-----------------------')