In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time
# Read the clock once so the file-name stamp (c_time) and the human-readable
# stamp (c_time_m) always describe the same instant; two independent
# localtime() calls can land on either side of a second boundary.
_run_started = time.localtime()
c_time = time.strftime("%Y%m%d_%H%M%S", _run_started)
c_time_m = time.strftime("%Y-%m-%d %H:%M:%S", _run_started)

In [None]:
# Parameters
Version = 'V3.2'

# Input files
INPUT_X = 'Features_94_343.csv'
INPUT_Y = 'Values_94.csv'
INPUT_TITLE = 'Title_343.csv'
INPUT_SPLIT = '100.0_Split.csv'        # saved training indices, read when FIND_SPLIT is False
INPUT_LRIF = 'LRIF_148_343.csv'
INPUT_LRIF_LIST = 'LRIF_title_148.csv'

# Evaluation settings
EPOCH = 1000                           # iterations of the hold-out loop
CV_LOOP_EPOCH = 300                    # iterations of the cross-validation loop
FOLD = 6                               # folds passed to cross_val_score
TRAIN_TEST_SPLIT = 0.85                # fraction of rows used for training
FIND_SPLIT = True                      # True: draw a new random split each epoch

# Output names, all stamped with the run start time
SAVE_NAME = 'DTC_FeatureImportance_{}.png'.format(c_time)
PLOT_NAME3 = 'DTC_ROC_{}'.format(c_time)
PLOT_NAME4 = 'DTC_CV_LOOP_{}.png'.format(c_time)
LOG_NAME = 'DTC_Log_{}.txt'.format(c_time)
LRIF_NAME = 'LRIF_Test_DTC_{}.csv'.format(c_time)
SUPTITLE = 'DTC on {} and {}\nEPOCH:{}'.format(INPUT_X, INPUT_Y, EPOCH)

In [None]:
# Feature matrix, target vector and feature names.
X = np.loadtxt(INPUT_X, delimiter=',')
y = np.loadtxt(INPUT_Y)
title = np.loadtxt(INPUT_TITLE, dtype='str')
print('X:', X.shape, '   y:', y.shape)

# Extra LRIF samples that every trained tree is asked to classify.
lrif_test = np.loadtxt(INPUT_LRIF, delimiter=',')
print(lrif_test.shape)
n_lrif = lrif_test.shape[0]

# One name per LRIF row, kept as a column vector so it can be stacked
# next to the vote counts later.
lrif_names_flat = np.loadtxt(INPUT_LRIF_LIST, dtype='str', delimiter='#')
lrif_list = lrif_names_flat.reshape(n_lrif, 1)

# Per-row vote tally: column 0 counts predictions equal to 1, column 1 the rest.
count_m = np.zeros(shape=(n_lrif, 2))

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.metrics import confusion_matrix
import graphviz
import joblib

In [None]:
import os
from pathlib import Path
# Output directory for this run; its name records the split mode and start time.
split_mode = 'FindSplit' if FIND_SPLIT else 'TestSplit'
DIR = 'DTC_'+Version+'_'+split_mode+'_'+c_time
# c_time is fixed for the whole kernel session, so a plain os.mkdir would raise
# FileExistsError the second time this cell runs; exist_ok keeps it re-runnable.
os.makedirs(DIR, exist_ok=True)
# Re-root on the bare file name: re-running the cell must not nest DIR inside
# a path that already starts with DIR.
PLOT_NAME4 = Path('.', DIR, Path(PLOT_NAME4).name)
LRIF_NAME = Path('.', DIR, Path(LRIF_NAME).name)

In [None]:
# Shuffle the data set and split it into training and test parts.
point = round(X.shape[0]*TRAIN_TEST_SPLIT)
if FIND_SPLIT:
    # New random split: the first `point` shuffled rows train, the rest test.
    permutation = np.random.permutation(y.shape[0])
    train_rows, test_rows = permutation[:point], permutation[point:]
    X_train, y_train = X[train_rows, :], y[train_rows]
    X_test, y_test = X[test_rows, :], y[test_rows]
else:
    # Reuse a saved split: rows listed in INPUT_SPLIT train, all others test.
    permutation = np.loadtxt(INPUT_SPLIT).astype(int).flatten().tolist()
    saved_train_rows = set(permutation)
    all_rows = range(X.shape[0])
    train_idx = [row for row in all_rows if row in saved_train_rows]
    test_idx = [row for row in all_rows if row not in saved_train_rows]
    X_train, y_train = X[train_idx, :], y[train_idx]
    X_test, y_test = X[test_idx, :], y[test_idx]
    # Shuffle the row order inside each part (membership is unchanged).
    perm_train = np.random.permutation(X_train.shape[0])
    X_train, y_train = X_train[perm_train, :], y_train[perm_train]
    perm_test = np.random.permutation(X_test.shape[0])
    X_test, y_test = X_test[perm_test, :], y_test[perm_test]

In [None]:
def cm_plot(y, yp, path):
    """Draw the confusion matrix of predictions ``yp`` against labels ``y`` and save it.

    Parameters
    ----------
    y : array-like
        True labels.
    yp : array-like
        Predicted labels.
    path : str or path-like
        Destination handed to ``savefig``.

    Notes
    -----
    Rows of the matrix (true labels) run along the vertical axis and columns
    (predicted labels) along the horizontal axis, so the count ``cm[row, col]``
    is annotated at ``xy=(col, row)``.
    The figure created here is closed before returning; other open figures
    are left alone.
    """
    cm = confusion_matrix(y, yp)  # confusion matrix
    # One explicit figure/axes pair: pyplot's matshow would open a second
    # figure and leave the sized one empty.
    fig, ax = plt.subplots(figsize=(5, 5), dpi=300)
    image = ax.matshow(cm, cmap=plt.cm.Greens)
    fig.colorbar(image)  # colour scale
    for row in range(cm.shape[0]):  # cell value labels
        for col in range(cm.shape[1]):
            ax.annotate(str(cm[row, col]), xy=(col, row),
                        horizontalalignment='center', verticalalignment='center')
    ax.set_ylabel('True label', fontsize=15)
    ax.set_xlabel('Predicted label', fontsize=15)
    fig.savefig(path, bbox_inches='tight', dpi=300)
    plt.close(fig)

In [None]:
# Grid-search the hyper-parameters of the decision tree.
# min_samples_split must be an int >= 2 (or a float fraction), so the value 1
# is not part of the grid: every candidate using it would fail to fit.
tuned_parameters = [{'min_impurity_decrease': [0.01], 'max_depth': [4, 5, None],
                     'max_features': [0.95], 'max_leaf_nodes': [4, 5, None], 'class_weight': ['balanced'],
                     'min_samples_leaf': [2, 3], 'min_samples_split': [2, 3]}]
dtc = DecisionTreeClassifier()
clf = GridSearchCV(dtc, tuned_parameters, verbose=1, scoring=None, cv=FOLD, n_jobs=-1)
# Search on the training part only. The best estimator is refit on whatever is
# passed here, so fitting on all of X would let it see the rows of X_test
# that it is scored on just below.
clf.fit(X_train, y_train)
clf_new = clf.best_estimator_
best_p = clf.best_params_
# Full constructor parameters of the winner; later cells rebuild trees from them.
paras = clf_new.get_params()
print(best_p)
acc_unique = clf_new.score(X_test, y_test)
print('Current accuracy:', acc_unique)

In [None]:
# Repeated hold-out evaluation. Each epoch trains a fresh tree with the
# grid-search parameters, scores it on the test part, tallies its votes on
# the LRIF samples and accumulates accuracy-weighted feature importances.
f_i = np.zeros((title.shape[0], 1))
mse_list = []
acc_list = []
mean_acc_list = []
min_mse = 999.9
for epoch in range(EPOCH):
    # Shuffle and split the data set.
    if FIND_SPLIT:
        permutation = np.random.permutation(y.shape[0])
        X_train = X[permutation[:point], :]
        y_train = y[permutation[:point]]
        X_test = X[permutation[point:], :]
        y_test = y[permutation[point:]]
    else:
        # Fixed split: only the row order inside each part changes.
        perm_train = np.random.permutation(X_train.shape[0])
        X_train = X_train[perm_train, :]
        y_train = y_train[perm_train]
        perm_test = np.random.permutation(X_test.shape[0])
        X_test = X_test[perm_test, :]
        y_test = y_test[perm_test]

    # Fit a fresh model carrying the tuned parameters.
    clf_new = DecisionTreeClassifier(**paras)
    clf_new.fit(X_train, y_train)
    y_pred = clf_new.predict(X_test)

    # Vote tally for the LRIF samples: column 0 counts predictions equal to 1,
    # column 1 counts everything else.
    lrif_predict = clf_new.predict(lrif_test)
    predicted_one = (lrif_predict == 1)
    count_m[:, 0] += predicted_one
    count_m[:, 1] += ~predicted_one

    # Accuracy in percent and MSE on the test part.
    acc = np.count_nonzero(y_pred == y_test)*100/X_test.shape[0]
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)
    acc_list.append(acc)
    mean_acc = np.mean(acc_list)
    mean_acc_list.append(mean_acc)
    print('Round:', epoch+1, "MSE: %.4f" % mse, '  Accuracy: %.4f' % acc, '  current mean acc: %.4f' % mean_acc)

    # Accumulate feature importances weighted by this epoch's accuracy.
    feature_importance = np.array(clf_new.feature_importances_).reshape(title.shape[0], 1)
    f_i = f_i+feature_importance*acc

    # Keep artefacts for every new best model and for any model under 0.15 MSE.
    if mse<min_mse or mse<0.15:
        if mse<min_mse:
            min_mse = mse
        # Name the pickle after THIS model's MSE. Using min_mse here would let
        # a merely "good enough" model overwrite the file of the best one.
        clf_name = Path('.', DIR, str(round(mse, 4))+'_DTC.pkl')
        joblib.dump(clf_new, clf_name)
        PLOT_NAME2 = Path('.', DIR, str(round(acc, 4))+'_DTC_ConfusionMatrix_'+c_time+'.png')
        cm_plot(y_test, y_pred, PLOT_NAME2)
        # export_graphviz expects class names in ascending class order, which
        # is the order of classes_; deriving them keeps labels and classes aligned.
        dot_data = tree.export_graphviz(clf_new, out_file=None,
                      feature_names=title,
                      class_names=['%g' % c for c in clf_new.classes_],
                      filled=True, rounded=True,
                      special_characters=True)
        graph = graphviz.Source(dot_data)
        graph.render(filename=str(round(acc, 4))+'_DTC', directory=DIR, format='png')
        # Save the training indices of this split.
        if FIND_SPLIT:
            SPLIT_NAME = Path('.', DIR, str(round(acc, 4))+'_Split.csv')
            np.savetxt(SPLIT_NAME, np.array(permutation[:point]).reshape(point, 1), fmt='%d')
print('Mean accuracy:', np.mean(acc_list))

In [None]:
# Repeated cross-validation: reshuffle the whole data set, run FOLD-fold CV
# with the tuned parameters, and track the per-round and running mean accuracy.
cv_acc_list = []
cv_mean_acc_list = []
for cv_round in range(1, CV_LOOP_EPOCH+1):
    # Reorder X and y with the same permutation so rows stay paired.
    permutation = np.random.permutation(y.shape[0])
    X = X[permutation, :]
    y = y[permutation]
    clf_brand_new = DecisionTreeClassifier()
    clf_brand_new.set_params(**paras)
    scores = cross_val_score(clf_brand_new, X, y, cv=FOLD, n_jobs=-1)
    round_acc = np.mean(scores)
    cv_acc_list.append(round_acc)
    running_mean = np.mean(cv_acc_list)
    cv_mean_acc_list.append(running_mean)
    print('round:', cv_round, '  accuarcy: %.4f' % round_acc, '  current mean acc: %.4f' % running_mean)
print('Mean accuracy of CV-Loop:', np.mean(cv_acc_list))

In [None]:
from scipy.stats import norm
# Histogram of the hold-out accuracies with a normal density (same mean and
# standard deviation) drawn over it.
acc_array = np.array(acc_list).reshape(len(acc_list), 1)
acc_sorted = np.sort(acc_array, axis=0)
mu, sigma = np.mean(acc_list), np.std(acc_list)
x_arg = np.linspace(1, acc_sorted.shape[0], acc_sorted.shape[0])

fig, ax = plt.subplots(figsize=(6, 6), dpi=250)
n, bins, patches = ax.hist(acc_sorted, bins=30, density=1)
# Evaluate the normal density at the bin edges returned by hist.
acc_N = norm.pdf(bins, mu, sigma)
ax.plot(bins, acc_N)
heading = ('Distribution of Acc\nMean Acc: '+str(round(np.mean(acc_list), 3))
           +'  Max Acc: '+str(round(max(acc_list), 3)))
ax.set_title(heading, fontsize=18)
ax.set_ylabel('Possibility', fontsize=15)
ax.set_xlabel('Acc', fontsize=15)

PLOT_NAME6 = Path('.', DIR, 'Acc_Distribution_DTC_NormalLoop_'+c_time+'.png')
fig.savefig(PLOT_NAME6)

In [None]:
# Histogram of the per-round CV accuracies with a normal density (same mean
# and standard deviation) drawn over it.
acc_array = np.array(cv_acc_list).reshape(len(cv_acc_list), 1)
acc_sorted = np.sort(acc_array, axis=0)
mu, sigma = np.mean(cv_acc_list), np.std(cv_acc_list)
x_arg = np.linspace(1, acc_sorted.shape[0], acc_sorted.shape[0])

fig, ax = plt.subplots(figsize=(6, 6), dpi=250)
n, bins, patches = ax.hist(acc_sorted, bins=30, density=1)
# Evaluate the normal density at the bin edges returned by hist.
acc_N = norm.pdf(bins, mu, sigma)
ax.plot(bins, acc_N)
heading = ('Distribution of Acc\nMean Acc: '+str(round(np.mean(cv_acc_list), 3))
           +'  Max Acc: '+str(round(max(cv_acc_list), 3)))
ax.set_title(heading, fontsize=18)
ax.set_ylabel('Possibility', fontsize=15)
ax.set_xlabel('CV Mean Acc', fontsize=15)

PLOT_NAME7 = Path('.', DIR, 'Acc_Distribution_DTC_CVLoop_'+c_time+'.png')
fig.savefig(PLOT_NAME7)

In [None]:
# Bar chart of the ten features with the largest accumulated importance.
plt.figure(figsize=(11, 8), dpi=250)
f_i_temp = f_i.copy()
# Relative importance: scale by the spread (max - min) of the accumulated values.
f_i_temp[:, 0] = 100.0 * (f_i_temp[:, 0]/(max(f_i_temp[:, 0])-min(f_i_temp[:, 0])))
sorted_idx = np.argsort(-f_i_temp[:, 0])  # descending order
pos = np.arange(sorted_idx.shape[0]) + .5
plt.barh(pos[:10, ], f_i_temp[sorted_idx[:10, ], 0].flatten().tolist(), align='center')
plt.yticks(pos[:10, ], title[sorted_idx[:10, ]])
plt.xlabel('Relative Importance', fontsize=16)
plt.title('Variable Importance (First 10)', fontsize=18)
# Build the heading in its own variable: appending to SUPTITLE itself would
# stack one more "Mean Acc" suffix every time this cell is re-run.
suptitle_text = SUPTITLE+' Mean Acc: '+str(np.mean(acc_list))
plt.suptitle(suptitle_text, fontsize=18)
# Re-root on the bare file name so a re-run does not nest DIR inside itself.
SAVE_NAME = Path('.', DIR, Path(SAVE_NAME).name)
plt.savefig(SAVE_NAME)

In [None]:
# Two stacked panels: per-epoch accuracy (red dots) and running mean (dotted
# blue) for the hold-out loop on top and the CV loop below.
fig, (ax_normal, ax_cv) = plt.subplots(2, 1, figsize=(11, 11), dpi=300)

x_idx = np.linspace(1, len(acc_list), len(acc_list)).tolist()
ax_normal.scatter(x_idx, acc_list, color='r')
ax_normal.plot(x_idx, mean_acc_list, 'b:')
ax_normal.set_title('Acc Curve of Normal Loop', fontsize=18)
ax_normal.set_ylabel('Accuracy', fontsize=15)
ax_normal.set_xlabel('Epoch', fontsize=15)

cv_x_idx = np.linspace(1, len(cv_acc_list), len(cv_acc_list)).tolist()
ax_cv.set_title('Acc Curve of CV-Loop', fontsize=18)
ax_cv.scatter(cv_x_idx, cv_acc_list, color='r')
ax_cv.plot(cv_x_idx, cv_mean_acc_list, 'b:')
ax_cv.set_ylabel('Accuracy', fontsize=15)
ax_cv.set_xlabel('Epoch', fontsize=15)

overall_heading = ('DecisionTreeClassifier Accuracy-Epoch Curves\n'+'Epoch of normal loop: '+
                   str(EPOCH)+'   Epoch of CV-loop: '+str(CV_LOOP_EPOCH))
fig.suptitle(overall_heading, fontsize=20)
fig.savefig(PLOT_NAME4)

In [None]:
# Put the LRIF names in front of the vote counts and write the table as CSV.
# The stacked table gets its own name: hstack with a string column yields a
# string array, and rebinding count_m to it would add a column on every re-run
# of this cell and stop the training loop from incrementing the counts again.
lrif_table = np.hstack((lrif_list, count_m))
np.savetxt(LRIF_NAME, lrif_table, fmt='%s', delimiter=',')

In [None]:
# Write a plain-text summary of the run into the output directory.
# Re-root on the bare file name so a re-run does not nest DIR inside itself.
LOG_NAME = Path('.', DIR, Path(LOG_NAME).name)
# The with-block closes the file even if one of the writes raises.
with open(LOG_NAME, 'w') as f1:
    f1.write('DecisionTreeClassifier Log\n\n')
    f1.write('Input data: '+INPUT_X+' and '+INPUT_Y+'\n')
    f1.write('Data Shape:'+str(X.shape)+', '+str(y.shape)+'\n\n')
    f1.write('Epoch of normal loop: '+str(EPOCH)+'\n')
    f1.write('Epoch of CV-loop: '+str(CV_LOOP_EPOCH)+'\n')
    f1.write('Fold number of CV-loop: '+str(FOLD)+'\n\n')
    f1.write('Best parameters from CV: '+str(best_p)+'\n\n')
    f1.write('Classifier parameters:\n')
    f1.write(str(paras)+'\n\n')
    f1.write('Mean accuracy of Normal Loop: '+str(np.mean(acc_list))+'\n')
    f1.write('Mean accuracy of CV-Loop: '+str(np.mean(cv_acc_list))+'\n\n')
    f1.write('Mean MSE:'+str(np.mean(mse_list))+'\n')
    # Top features by relative importance; capped at the number of features
    # available so data sets with fewer than 30 columns do not raise IndexError.
    for i in range(min(30, sorted_idx.shape[0])):
        f1.write('name:'+str(title[sorted_idx[i, ], ])+'   value:'+str(f_i_temp[sorted_idx[i, ], 0])+'\n')

In [None]:
from sklearn import svm, datasets
from sklearn.metrics import auc
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2.
# RocCurveDisplay.from_estimator is its replacement and accepts the same
# arguments used below; fall back to the old function on older releases.
if hasattr(RocCurveDisplay, 'from_estimator'):
    plot_roc_curve = RocCurveDisplay.from_estimator
else:
    from sklearn.metrics import plot_roc_curve

# ROC curves over random 85/15 shuffle splits, plus their mean and a +/- 1
# standard deviation band.
n_samples, n_features = X.shape
n_roc_splits = 10
rand_state = np.random.randint(5000)
print('Random state:', rand_state)
cv = ShuffleSplit(n_splits=n_roc_splits, test_size=.15, random_state=rand_state)

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)  # common FPR grid for averaging the curves

# A single figure that actually carries the requested size and dpi; a separate
# plt.figure(...) call followed by plt.subplots() would leave the sized figure
# empty and draw on a default-sized one.
fig, ax = plt.subplots(figsize=(8, 5), dpi=300)

# One tree per split, each built from the tuned parameters.
clf_list = [DecisionTreeClassifier(**paras) for _ in range(n_roc_splits)]

for i, (train, test) in enumerate(cv.split(X, y)):
    clf_list[i].fit(X[train], y[train])
    viz = plot_roc_curve(clf_list[i], X[test], y[test],
                         name='ROC fold {}'.format(i),
                         alpha=0.3, lw=1, ax=ax)
    # Resample this split's curve onto the common grid; pin the start at 0.
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
        label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # pin the end of the mean curve at (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b',
        label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
        lw=2, alpha=.8)

# Band of one standard deviation around the mean curve, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                label=r'$\pm$ 1 std. dev.')

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
       title="DTC Receiver operating characteristic curve\n"+'Random state: '+str(rand_state))
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
PLOT_NAME3_n = PLOT_NAME3+'_'+str(rand_state)+'.png'
PLOT_NAME3_n = Path('.', DIR, PLOT_NAME3_n)
fig.savefig(PLOT_NAME3_n, bbox_inches='tight', dpi=300)

In [None]:
from sklearn.model_selection import cross_val_score
# 3-fold cross-validation scores for clf_new (last assigned in the hold-out
# loop); the bare name on the last line displays them as the cell output.
scores = cross_val_score(estimator=clf_new, X=X, y=y, cv=3)
scores