# 基于Scikit-Learn API的XGBoost  Regressor模板
最后更新时间：2022.02.05 戴以恒   
### 异步进程池并行版  

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from multiprocessing import Pool
import time
c_time = time.strftime("%Y%m%d_%H%M%S", time.localtime())
c_time_m = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

In [2]:
# 参数
# ======== System Setup ========
Version = 'V5.1.0-Alpha'
EPOCH = 160
CORE_NUM = 32       # 总运行轮数=EPOCH/CORE_NUM
# ======== Fit Data Input ========
S_N = 93
F_N = 6
INPUT_X = 'Features_'+str(S_N)+'_'+str(F_N)+'.csv'
INPUT_Y = 'Values_True_'+str(S_N)+'.csv'
INPUT_TITLE = 'Title_'+str(F_N)+'.csv'
BEGIN_INDEX = 0
END_INDEX_PLUS_ONE = None   # 设为None或者调成0和特征数量，即可使用全部特征
INPUT_SMILES = 'Smiles_'+str(S_N)+'.csv'
TITLE_DATE = '220205'
# ======== Find Split Settings ========
INPUT_SPLIT = 'R2_0.6962_XGB-Split.csv'
FIND_SPLIT = True
CAL_MAE_LOOP = False
SAVE_MODEL = False
# ======== Other Fitting Settings ========
TRAIN_TEST_SPLIT = 0.85
SORT_SAMPLE = False
TEST_SPLIT_OOB = False
R2_HIGHER_LIMIT = 0.70
CONFIDENCE = 0.95
SAVE_RESULTS_OF_EVERY_ROUND = False
# ======== Extra Prediction ========
PREDICT_MORE = True
PREDICT_DATA = 'Features_3780_6.csv'

# ======== Data Output ========
SAVE_NAME = 'XGBoostRegressor_'+c_time+'.png'
SUPTITLE = 'XGBoost on '+INPUT_X+' and '+INPUT_Y+'\nEPOCH:'+str(EPOCH)+'\n'

In [3]:
if END_INDEX_PLUS_ONE != None:
    X = np.loadtxt(INPUT_X, delimiter=',')[:, BEGIN_INDEX:END_INDEX_PLUS_ONE]
    title = np.loadtxt(INPUT_TITLE, dtype='str', delimiter=',', comments='!')[BEGIN_INDEX:END_INDEX_PLUS_ONE, ]
else:
    X = np.loadtxt(INPUT_X, delimiter=',')
    title = np.loadtxt(INPUT_TITLE, dtype='str', delimiter=',', comments='!')
y = np.loadtxt(INPUT_Y)

print('X:', X.shape, '   y:', y.shape)
if PREDICT_MORE:
    X_p = np.loadtxt(PREDICT_DATA, delimiter=',')
    y_p = []


X: (93, 6)    y: (93,)


In [4]:
with open(PREDICT_DATA, 'r') as file:
    for i, line in enumerate(file):
        try:
            # 尝试将每一行的数据转换为浮点数
            numbers = [float(num) for num in line.strip().split(',')]
        except ValueError as e:
            print(f"Error in line {i}: {line}")
            print(f"Exception: {e}")
            break  # 找到第一个错误就停止，或者删除 break 以查找更多错误

In [5]:
# # 将原始数据按标签值从小到大排序：
# if SORT_SAMPLE:
#     sort_permu = np.argsort(y).flatten().tolist()
#     X = X[sort_permu, :]
#     y = y[sort_permu, ]
#     smiles = smiles[sort_permu, ]

In [6]:
from sklearn import model_selection
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib

In [7]:
import os
from pathlib import Path
if FIND_SPLIT:
    DIR = 'XGB-R_'+Version+'_FindSplit_'+str(X.shape[1])+'_Fs_'+c_time
else:
    DIR = 'XGB-R_'+Version+'_TestSplit_'+str(X.shape[1])+'_Fs_'+c_time
os.mkdir(DIR)

In [8]:
# 打乱和切分数据集
point = round(X.shape[0]*TRAIN_TEST_SPLIT)
if not FIND_SPLIT:
    permutation = np.loadtxt(INPUT_SPLIT).astype(int).flatten().tolist()
    train_idx = []
    test_idx = []
    for i in range(X.shape[0]):
        if i in permutation:
            train_idx.append(i)
        else:
            test_idx.append(i)
    X_train_o = X[train_idx, :]
    y_train_o = y[train_idx]
    X_test = X[test_idx, :]
    y_test = y[test_idx]
else:
    permutation = np.random.permutation(y.shape[0])
    train_idx = permutation[:point]
    test_idx = permutation[point:]
    X_train = X[train_idx, :]
    y_train = y[train_idx]
    X_test = X[test_idx, :]
    y_test = y[test_idx]

In [9]:
# 初始化MAE矩阵
if CAL_MAE_LOOP:
    mae_m = np.zeros((X.shape[0], 4))
    mae_count = np.zeros((X.shape[0], 2))

In [10]:
# Create clf (sklean的API)
clf = XGBRegressor(n_estimators=350, learning_rate=0.03, max_depth=8, verbosity=0, booster='gbtree', 
                   reg_alpha=np.exp(-3), reg_lambda=np.exp(-3), gamma=np.exp(-5), 
                   subsample=0.5, objective= 'reg:squarederror', n_jobs=1)
paras = clf.get_params()
mse_list = []
mae_list = []
r2_list = []

f_i = np.zeros((title.shape[0], 1))
max_r2 = -999.9

In [11]:
def draw_scatter_nomral(true_train, pred_train, true_test, pred_test, X_train, X_test):
    global DIR, FIND_SPLIT, CAL_MAE_LOOP, R2_HIGHER_LIMIT, mse_list, mae_list, r2_list, f_i, mae_m, mae_count, max_r2
    fig = plt.figure(figsize=(7, 7.8), dpi=300)
    ax = fig.add_axes([0.15, 0.09, 0.78, 0.74])
    ax.scatter(true_train, pred_train, s=25, alpha=0.75)
    ax.scatter(true_test, pred_test, s=25, alpha=0.75)
    left_limit = min(min(true_test)-1, min(true_train)-1)
    right_limit = max(max(true_test)+1, max(true_train)+1)
    ax.plot([left_limit, right_limit], [left_limit, right_limit], 'r:')
    ax.plot([left_limit, right_limit], [left_limit+1, right_limit+1], 'y:')
    ax.plot([left_limit, right_limit], [left_limit-1, right_limit-1], 'y:')
    ax.legend(['Correct', 'Correct+1', 'Correct-1', 'Train', 'Test'], loc='upper left', shadow=True, fontsize=17)
    ax.set_xlabel('True Values', fontsize=18)
    ax.set_ylabel('Prediction Values', fontsize=18)
    train_mse = mean_squared_error(true_train, pred_train)
    train_mae = mean_absolute_error(true_train, pred_train)
    train_r2 = r2_score(true_train, pred_train)
    test_mse = mean_squared_error(true_test, pred_test)
    test_mae = mean_absolute_error(true_test, pred_test)
    test_r2 = r2_score(true_test, pred_test)
    plt.suptitle('XGBoost Regressor True-Predict Scatter'+
                 '\nTrain R^2: '+str(round(train_r2, 5))+'  Test R^2: '+str(round(test_r2, 5))+
                 '\nTrain MSE: '+str(round(train_mse, 5))+'  Test MSE: '+str(round(test_mse, 5))+
                 '\nTrain MAE: '+str(round(train_mae, 5))+'  Test MAE: '+str(round(test_mae, 5)), fontsize=18)
    PLOT_NAME3 = str(round(test_r2, 4))+'-R2_XGBoost_Scatter_'+c_time+'.png'
    PLOT_NAME3 = Path('.', DIR, PLOT_NAME3)
    plt.savefig(PLOT_NAME3)
    plt.close()

In [12]:
def XGB_Fit(X, y, X_train, y_train, X_test, y_test, paras):
    clf_new = XGBRegressor()
    for k, v in paras.items():
        clf_new.set_params(**{k: v})
    # print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    # 拟合模型
    clf_new.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=150, verbose=False)
    # 计算损失
    y_pred = clf_new.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    temp = [mse, mae, r2]
    print('   MSE: %.5f' % mse, '  MAE: %.5f' % mae, '  R^2: %.5f' % r2)
    return (temp, clf_new)

In [13]:
r_l = []
split_l = []
if FIND_SPLIT:
    test_idx_m = []
for _ in range(int(EPOCH/CORE_NUM)):
    print('Round', CORE_NUM*(_)+1, 'Begin:')
    pool = Pool(CORE_NUM)
    for __ in range(CORE_NUM):
        if FIND_SPLIT:
            permutation = np.random.permutation(y.shape[0])
            train_idx = permutation[:point]
            test_idx = permutation[point:]
            X_train = X[train_idx, :]
            y_train = y[train_idx]
            X_test = X[test_idx, :]
            y_test = y[test_idx]
            split_l.append(train_idx)
            test_idx_m.append(test_idx)
        else:
            if TEST_SPLIT_OOB:
                idx_t = np.random.choice(X_train_o.shape[0], size=X_train_o.shape[0], replace=True).flatten().tolist()
                X_train = X_train_o[idx_t, :]
                y_train = y_train_o[idx_t]
                # perm_train = np.random.permutation(X_train.shape[0])
                split_l.append(train_idx)
            else:
                perm_train = np.random.permutation(X_train.shape[0])
                X_train = X_train_o[perm_train, :]
                y_train = y_train_o[perm_train]
                split_l.append(train_idx)
        r = pool.apply_async(XGB_Fit, args=(X, y, X_train, y_train, X_test, y_test, paras,))
        r_l.append(r)
    pool.close()
    pool.join()

Round 1 Begin:
   MSE: 73.05345   MAE: 7.08062   R^2: -0.03840
   MSE: 53.47496   MAE: 5.97571   R^2: 0.44020
   MSE: 22.34929   MAE: 3.95157   R^2: 0.84627
   MSE: 49.05646   MAE: 5.60508   R^2: 0.63710
   MSE: 119.71411   MAE: 8.90537   R^2: 0.35601
   MSE: 182.31468   MAE: 10.91950   R^2: 0.55797
   MSE: 50.66761   MAE: 5.69484   R^2: -0.13801
   MSE: 61.56971   MAE: 6.34784   R^2: 0.71965
   MSE: 67.00067   MAE: 7.18905   R^2: 0.26855
   MSE: 73.76135   MAE: 7.03946   R^2: 0.65746
   MSE: 94.99803   MAE: 7.72201   R^2: 0.66729
   MSE: 88.91935   MAE: 7.52015   R^2: 0.60976
   MSE: 71.79292   MAE: 7.27307   R^2: 0.44019
   MSE: 48.00733   MAE: 5.63939   R^2: 0.79986
   MSE: 149.01686   MAE: 9.02032   R^2: 0.26276
   MSE: 73.45971   MAE: 7.44025   R^2: 0.72963
Round 17 Begin:
   MSE: 93.85876   MAE: 7.85562   R^2: 0.19989
   MSE: 79.34862   MAE: 7.78238   R^2: 0.33759
   MSE: 28.98686   MAE: 4.34488   R^2: 0.73059
   MSE: 105.42902   MAE: 7.54297   R^2: 0.40286
   MSE: 57.87669   MAE

   MSE: 34.18161   MAE: 4.90063   R^2: 0.60778
   MSE: 85.90661   MAE: 6.87483   R^2: 0.57441
   MSE: 79.46922   MAE: 7.66709   R^2: 0.66560
   MSE: 47.47368   MAE: 5.54043   R^2: 0.73402
   MSE: 63.27452   MAE: 7.18105   R^2: 0.55384
   MSE: 66.67643   MAE: 7.23488   R^2: 0.75931
   MSE: 70.62501   MAE: 7.65627   R^2: 0.56837
Round 177 Begin:
   MSE: 81.25859   MAE: 7.86004   R^2: 0.46645
   MSE: 47.35563   MAE: 5.69290   R^2: -0.16632
   MSE: 75.17260   MAE: 6.31546   R^2: 0.45036
   MSE: 56.21713   MAE: 6.01512   R^2: 0.40111
   MSE: 82.97029   MAE: 7.31125   R^2: 0.63020
   MSE: 196.93973   MAE: 10.58454   R^2: -0.02969
   MSE: 32.51251   MAE: 4.94906   R^2: 0.25523
   MSE: 34.69933   MAE: 4.83440   R^2: 0.69708
   MSE: 53.51854   MAE: 5.81343   R^2: 0.49057
   MSE: 32.03933   MAE: 4.47532   R^2: 0.86146
   MSE: 55.83350   MAE: 6.77206   R^2: 0.68153
   MSE: 45.92047   MAE: 4.84267   R^2: 0.57216
   MSE: 71.59215   MAE: 6.77564   R^2: 0.45711
   MSE: 54.82480   MAE: 6.16675   R^2: 

   MSE: 136.90431   MAE: 8.15692   R^2: 0.26161
   MSE: 61.24522   MAE: 6.16561   R^2: 0.45223
   MSE: 154.28268   MAE: 8.84880   R^2: 0.22994
   MSE: 23.58549   MAE: 3.73573   R^2: 0.89264
   MSE: 124.90211   MAE: 7.98313   R^2: 0.60335
   MSE: 102.26500   MAE: 7.52485   R^2: 0.69237
   MSE: 106.08669   MAE: 7.80277   R^2: 0.56163
   MSE: 125.58061   MAE: 8.09687   R^2: 0.36602
   MSE: 94.62880   MAE: 7.41379   R^2: 0.54431
   MSE: 111.45336   MAE: 7.33219   R^2: 0.49467
   MSE: 55.33915   MAE: 5.74488   R^2: 0.68946
   MSE: 50.62592   MAE: 5.99960   R^2: 0.56924
   MSE: 139.84535   MAE: 8.60132   R^2: 0.41803
Round 353 Begin:
   MSE: 50.28634   MAE: 6.08536   R^2: 0.60728
   MSE: 80.59304   MAE: 7.21564   R^2: 0.39348
   MSE: 87.73344   MAE: 7.24286   R^2: 0.58504
   MSE: 44.37438   MAE: 5.25432   R^2: 0.32238
   MSE: 110.94019   MAE: 8.60921   R^2: 0.54404
   MSE: 49.11128   MAE: 6.18752   R^2: 0.62894
   MSE: 79.48191   MAE: 7.80348   R^2: 0.41896
   MSE: 57.36038   MAE: 6.36520   

In [14]:
full_m = []
for i in range(len(r_l)):
    r = r_l[i]
    results = r.get()
    temp = results[0]
    mse = temp[0]
    mae = temp[1]
    r2 = temp[2]
    mse_list.append(mse)
    mae_list.append(mae)
    r2_list.append(r2)
    clf_new = results[1]
    # 计算特征重要度
    feature_importance = np.array(clf_new.feature_importances_).reshape(title.shape[0], 1)
    f_i = f_i+feature_importance*r2
    
    train_idx = split_l[i]
    test_idx = []
    for j in range(X.shape[0]):
        if j not in train_idx:
            test_idx.append(j)
    X_train = X[train_idx, :]
    y_train = y[train_idx]
    X_test = X[test_idx, :]
    y_test = y[test_idx]
    y_full_pred = clf_new.predict(X)
    full_m.append(y_full_pred)
    
    if PREDICT_MORE:
        y_p_pred = clf_new.predict(X_p)
        y_p.append(y_p_pred)
    
    # 计算所有样本的MAE
    if CAL_MAE_LOOP:
        for i in range(len(y_full_pred)):
            if i in train_idx:
                mae_m[i, 0] += abs(y_full_pred[i]-y[i, ])
                mae_m[i, 2] += y_full_pred[i]-y[i, ]
                mae_count[i, 0] += 1
            elif i in test_idx:
                mae_m[i, 1] += abs(y_full_pred[i]-y[i, ])
                mae_m[i, 3] += y_full_pred[i]-y[i, ]
                mae_count[i, 1] += 1
    if r2>max_r2 or r2>R2_HIGHER_LIMIT:
        if r2>max_r2:
            max_r2 = r2
        # 保存切分数据
        if FIND_SPLIT:
            SPLIT_NAME = 'R2_'+str(round(r2, 4))+'_XGB-Split.csv'
            SPLIT_NAME = Path('.', DIR, SPLIT_NAME)
            np.savetxt(SPLIT_NAME, np.array(train_idx).reshape(point, 1), fmt='%d')
        if SAVE_MODEL:
            clf_name = str(round(r2, 4))+'_XGB.pkl'
            clf_name = Path('.', DIR, clf_name)
            joblib.dump(clf_new, clf_name)
        pred_train = clf_new.predict(X_train)
        pred_test = clf_new.predict(X_test)
        true_train = y_train.flatten().tolist()
        true_test = y_test.flatten().tolist()
        draw_scatter_nomral(true_train, pred_train, true_test, pred_test, X_train, X_test)

In [15]:
X_p

array([[ 8.3000000e+01,  1.2916000e+01,  5.1060000e+01,  5.5324298e+02,
        -3.3132700e+00,  2.3330600e+00],
       [ 3.0000000e+01,  8.7520000e+00,  1.3500000e+01,  1.9827573e+02,
        -6.9017100e+00,  2.3543400e+00],
       [ 8.0000000e+01,  1.1160000e+01,  5.0550000e+01,  4.9707367e+02,
        -1.1802400e+00,  6.7060700e+00],
       ...,
       [ 7.7000000e+01,  1.6814000e+01,  1.6810000e+01,  4.4884724e+02,
        -8.0000000e-05,  4.0810000e-01],
       [ 6.8000000e+01,  9.0410000e+00,  3.2550000e+01,  4.0847437e+02,
        -5.2639000e-01,  5.2415000e-01],
       [ 4.3000000e+01,  9.4140000e+00,  7.3000000e+00,  2.4334461e+02,
        -1.0906490e+01,  4.6163100e+00]])

In [16]:
if PREDICT_MORE:
    y_p = np.transpose(np.array(y_p))
    print(y_p.shape)
    y_p_std = []
    y_p_mean = []
    for i in range(y_p.shape[0]):
        y_p_std.append(np.std(y_p[i, :]))
        y_p_mean.append(np.mean(y_p[i, :]))
    y_p_std = np.array(y_p_std).reshape(y_p.shape[0], 1)
    y_p_mean = np.array(np.exp(y_p_mean)).reshape(y_p.shape[0], 1)
    
    # Combine mean and std into one array without including SMILES
    out_p = np.hstack((y_p_mean, y_p_std))
    
    # Update the title to reflect the new structure
    title_p = np.array(['Predicted TPACS(GM)', 'STD of Prediction']).reshape(1, 2)
    
    # Stack the title and the data
    out_p = np.vstack((title_p, out_p))
    
    # Construct the filename for saving
    save_name = f'XGBoost_00_Extra_Prediction_Data_{str(EPOCH)}_Rounds_{c_time}.csv'
    save_name = Path('.', DIR, save_name)
    
    # Save the data to a CSV file
    np.savetxt(save_name, out_p, delimiter=',', fmt='%s')

(3780, 480)


In [17]:
    save_name = f'XGBoost_00_Extra_Prediction_Data_{str(EPOCH)}_Rounds_{c_time}.csv'
    save_name = Path('.', DIR, save_name)
    
    # Save the data to a CSV file
    np.savetxt(save_name, y_p, delimiter=',', fmt='%s')

In [18]:
if CAL_MAE_LOOP:
    MAE_NAME = 'XGBoost_00_MAE_Counts_'+c_time+'.txt'
    MAE_NAME = Path('.', DIR, MAE_NAME)
    for i in range(mae_m.shape[0]):
        mae_m[i, 0] /= mae_count[i, 0]
        mae_m[i, 2] /= mae_count[i, 0]
        mae_m[i, 1] /= mae_count[i, 1]
        mae_m[i, 3] /= mae_count[i, 1]
    idx = np.argsort(-mae_m[:, 1])
    f2 = open(MAE_NAME, 'w+')
    f2.write('MAE Calculation of all samples:\n')
    f2.write('Total Rounds: '+str(EPOCH)+'\n\n')
    for i in range(len(idx)):
        index = idx[i]
        f2.write('Molecule No.'+str(index+1)+'\n')
        f2.write('Smiles: '+smiles[index, ]+'\n')
        f2.write('    True Values: '+str(y[index, ])+'\n')
        f2.write('    Training MAE: '+str(mae_m[index, 0])+'\n')
        f2.write('    Training ME: '+str(mae_m[index, 2])+'\n')
        f2.write('    Testing MAE: '+str(mae_m[index, 1])+'\n')
        f2.write('    Testing ME: '+str(mae_m[index, 3])+'\n\n\n')
    f2.close()

In [19]:
full_m = np.array(full_m)
mean_list = []
std_list = []
for i in range(full_m.shape[1]):
    mean_list.append(np.mean(full_m[:, i]))
    std_list.append(np.std(full_m[:, i]))
fig = plt.figure(figsize=(10, 8), dpi=300)
ax = fig.add_axes([0.11, 0.08, 0.88, 0.815])
true_y = y.flatten().tolist()
sc = ax.scatter(true_y, mean_list, alpha=0.55, c=std_list, cmap='viridis', marker='o')
left_limit = min(min(true_y)-1, min(mean_list)-1)
right_limit = max(max(true_y)+1, max(mean_list)+1)
ax.plot([left_limit, right_limit], [left_limit, right_limit], color='#B22222', linestyle=':', linewidth = '2')
ax.plot([left_limit, right_limit], [left_limit+1, right_limit+1], color='#FFA500', linestyle=':', linewidth = '2')
ax.plot([left_limit, right_limit], [left_limit-1, right_limit-1], color='#FFA500', linestyle=':', linewidth = '2')
ax.legend(['Correct', 'Correct+1', 'Correct-1', 'Mean of Prediction'], loc='upper left', fontsize=17, shadow=True)
ax.set_xlabel('True Values', fontsize=17)
ax.set_ylabel('Mean Values of Prediction', fontsize=17)
plt.suptitle('Scatter of Mean Prediction vs True Values of '+str(EPOCH)+' Rounds\n'+
             'Mean:  MSE: '+str(round(np.mean(mse_list), 4))+
             '  MAE: '+str(round(np.mean(mae_list), 4))+
             '  R^2: '+str(round(np.mean(r2_list), 4)), fontsize=21)
cb = plt.colorbar(sc)
cb.set_label('Standard Deviation of Prediction Values', fontsize=17)
plt.grid(which='major', color='#D5D5D5', alpha=0.5)
save_name = 'XGBoost_01b_Mean_Prediction_Distribution_'+c_time+'.png'
save_name = Path('.', DIR, save_name)
plt.savefig(save_name)
if SAVE_RESULTS_OF_EVERY_ROUND:
    save_name = 'XGBoost_01a_Prediction_Data_'+str(EPOCH)+'_Rounds_'+c_time+'.csv'
    save_name = Path('.', DIR, save_name)
    np.savetxt(save_name, full_m, delimiter=',', fmt='%s')

In [20]:
me_plot_list = []
for i in range(len(mean_list)):
    me_plot_list.append(mean_list[i]-true_y[i])
fig = plt.figure(figsize=(12, 6), dpi=300)
ax = fig.add_axes([0.08, 0.11, 0.95, 0.815])
x_idx = np.linspace(0, len(me_plot_list)-1, len(me_plot_list))
sc = ax.scatter(x_idx, me_plot_list, alpha=1.0, c=std_list, cmap='viridis', marker='o')
ax.set_xlabel('Sample ID', fontsize=17)
ax.set_ylabel('Deviation of Prediction Values', fontsize=17)
ax.plot([0, len(me_plot_list)], [0, 0], color='#B22222', linestyle=':', linewidth = '2')
ax.plot([0, len(me_plot_list)], [1, 1], color='#FFA500', linestyle=':', linewidth = '2')
ax.plot([0, len(me_plot_list)], [-1, -1], color='#FFA500', linestyle=':', linewidth = '2')
# ax.legend(['Correct', 'Correct+1', 'Correct-1', 'Mean of Prediction'], loc='upper left', fontsize=17, shadow=True)
cb = plt.colorbar(sc)
cb.set_label('Standard Deviation of Prediction Values', fontsize=17)
plt.grid(which='major', color='#D5D5D5', alpha=0.5)
plt.suptitle('XGBoost Scatter of Prediction Deviation vs Sample ID of '+str(EPOCH)+' Rounds', fontsize=21)
save_name = 'XGBoost_01c_Prediction_Deviation_'+c_time+'.png'
save_name = Path('.', DIR, save_name)
plt.savefig(save_name)

In [21]:
if FIND_SPLIT:
    test_idx_m = np.array(test_idx_m)
    save_name = 'XGBoost_02a_Test_Index_'+c_time+'.csv'
    save_name = Path('.', DIR, save_name)
    np.savetxt(save_name, test_idx_m, fmt='%d', delimiter=',')
    test_data_m = []
    for i in range(X.shape[0]):
        test_data_m.append([])
    for i in range(test_idx_m.shape[0]):
        for j in range(test_idx_m.shape[1]):
            test_data_m[test_idx_m[i, j]].append(full_m[i, test_idx_m[i, j]])
    test_upper_l = []
    test_lower_l = []
    test_mean_l = []
    test_median_l = []
    test_std_l = []
    test_me_l = []
    for i in range(X.shape[0]):
        test_upper_l.append(max(test_data_m[i]))
        test_lower_l.append(min(test_data_m[i]))
        test_mean_l.append(np.mean(test_data_m[i]))
        test_median_l.append(np.median(test_data_m[i]))
        test_std_l.append(np.std(test_data_m[i]))
        test_me_l.append(test_mean_l[i]-true_y[i])

In [22]:
if FIND_SPLIT:
    fig = plt.figure(figsize=(10, 8), dpi=300)
    ax = fig.add_axes([0.11, 0.08, 0.88, 0.815])
    sc = ax.scatter(true_y, test_mean_l, alpha=0.55, c=test_std_l, cmap='viridis', marker='o')
    left_limit = min(min(true_y)-1, min(test_mean_l)-1)
    right_limit = max(max(true_y)+1, max(test_mean_l)+1)
    ax.plot([left_limit, right_limit], [left_limit, right_limit], color='#B22222', linestyle=':', linewidth = '2')
    ax.plot([left_limit, right_limit], [left_limit+1, right_limit+1], color='#FFA500', linestyle=':', linewidth = '2')
    ax.plot([left_limit, right_limit], [left_limit-1, right_limit-1], color='#FFA500', linestyle=':', linewidth = '2')
    ax.legend(['Correct', 'Correct+1', 'Correct-1', 'Mean of Test Prediction'], loc='upper left', fontsize=17, shadow=True)
    ax.set_xlabel('True Values', fontsize=17)
    ax.set_ylabel('Mean Values of Test Prediction', fontsize=17)
    plt.suptitle('Scatter of Mean Test Prediction vs True of '+str(EPOCH)+' Rounds\n'+
                 'Mean Test:  MSE: '+str(round(np.mean(mse_list), 4))+
                 '  MAE: '+str(round(np.mean(mae_list), 4))+
                 '  R^2: '+str(round(np.mean(r2_list), 4)), fontsize=21)
    cb = plt.colorbar(sc)
    cb.set_label('Standard Deviation of Test Predictions', fontsize=17)
    plt.grid(which='major', color='#D5D5D5', alpha=0.5)
    save_name = 'XGBoost_02b_Mean_Test_Prediction_Distribution_'+c_time+'.png'
    save_name = Path('.', DIR, save_name)
    plt.savefig(save_name)

In [23]:
if FIND_SPLIT:
    fig = plt.figure(figsize=(12, 6), dpi=300)
    ax = fig.add_axes([0.08, 0.11, 0.95, 0.815])
    sc = ax.scatter(x_idx, test_me_l, alpha=1.0, c=test_std_l, cmap='viridis', marker='o')
    ax.set_xlabel('Sample ID', fontsize=17)
    ax.set_ylabel('Deviation of Test Predictions', fontsize=17)
    ax.plot([0, len(test_me_l)], [0, 0], color='#B22222', linestyle=':', linewidth = '2')
    ax.plot([0, len(test_me_l)], [1, 1], color='#FFA500', linestyle=':', linewidth = '2')
    ax.plot([0, len(test_me_l)], [-1, -1], color='#FFA500', linestyle=':', linewidth = '2')
    # ax.legend(['Correct', 'Correct+1', 'Correct-1', 'Mean of Prediction'], loc='upper left', fontsize=17, shadow=True)
    cb = plt.colorbar(sc)
    cb.set_label('Standard Deviation of Test Predictions', fontsize=17)
    plt.grid(which='major', color='#D5D5D5', alpha=0.5)
    plt.suptitle('XGBoost Scatter of Test Prediction Deviation vs Sample ID of '+str(EPOCH)+' Rounds', fontsize=21)
    save_name = 'XGBoost_02c_Test_Prediction_Deviation_'+c_time+'.png'
    save_name = Path('.', DIR, save_name)
    plt.savefig(save_name)

In [24]:
if FIND_SPLIT:
    fig = plt.figure(figsize=(36, 9), dpi=300)
    ax = fig.add_axes([0.05, 0.11, 0.90, 0.815])
    ax.set_xlim(-1, X.shape[0])
    p1_std = []
    p2_std = []
    n1_std = []
    n2_std = []
    for i in range(X.shape[0]):
        p1_std.append(test_mean_l[i]+test_std_l[i])
        p2_std.append(test_mean_l[i]+test_std_l[i]*2)
        n1_std.append(test_mean_l[i]-test_std_l[i])
        n2_std.append(test_mean_l[i]-test_std_l[i]*2)
        ax.plot([x_idx[i], x_idx[i]], [test_upper_l[i], test_lower_l[i]], linestyle='-', color='grey', linewidth=1.4, alpha=0.5)
    sc = ax.scatter(x_idx, test_upper_l, alpha=1.0, marker='_', color='grey', s=30)
    sc = ax.scatter(x_idx, test_lower_l, alpha=1.0, marker='_', color='grey', s=30)
    sc = ax.scatter(x_idx, test_mean_l, alpha=1.0, marker='_', color='black', s=16)
    sc = ax.scatter(x_idx, test_median_l, alpha=1.0, marker='_', color='lightgrey', s=16)
    sc = ax.scatter(x_idx, p1_std, alpha=1.0, marker='2', color='gold', s=20)
    sc = ax.scatter(x_idx, n1_std, alpha=1.0, marker='1', color='gold', s=20)
    sc = ax.scatter(x_idx, p2_std, alpha=1.0, marker='2', color='peru', s=20)
    sc = ax.scatter(x_idx, n2_std, alpha=1.0, marker='1', color='peru', s=20)
    sc = ax.scatter(x_idx, true_y, alpha=1.0, marker='x', color='lawngreen', s=24)
    ax.set_xlabel('Sample ID', fontsize=17)
    ax.set_ylabel('Deviation of Test Predictions', fontsize=17)
    # ax.legend(['Correct', 'Correct+1', 'Correct-1', 'Mean of Prediction'], loc='upper left', fontsize=17, shadow=True)
    plt.grid(which='major', color='#D5D5D5', alpha=0.5)
    plt.suptitle('XGBoost Scatter of Test Prediction Deviation vs Sample ID of '+str(EPOCH)+' Rounds', fontsize=21)
    save_name = 'XGBoost_02d_Test_Prediction_Line_'+c_time+'.png'
    save_name = Path('.', DIR, save_name)
    plt.savefig(save_name)

In [25]:
from scipy.stats import norm
mu_mse = np.mean(mse_list)
sigma_mse = np.std(mse_list)
mse_array = np.array(mse_list).reshape(len(mse_list), 1)
mse_sorted = np.sort(mse_array, axis=0)
x_arg_mse = np.linspace(1, mse_sorted.shape[0], mse_sorted.shape[0])
mu_mae = np.mean(mae_list)
sigma_mae = np.std(mae_list)
mae_array = np.array(mae_list).reshape(len(mae_list), 1)
mae_sorted = np.sort(mae_array, axis=0)
x_arg_mae = np.linspace(1, mae_sorted.shape[0], mae_sorted.shape[0])
mu_r2 = np.mean(r2_list)
sigma_r2 = np.std(r2_list)
r2_array = np.array(r2_list).reshape(len(r2_list), 1)
r2_sorted = np.sort(r2_array, axis=0)
x_arg_r2 = np.linspace(1, r2_sorted.shape[0], r2_sorted.shape[0])

fig = plt.figure(figsize=(8, 8), dpi=300)
ax = fig.add_axes([0.11, 0.08, 0.85, 0.84])
n, bins_mse, patches = ax.hist(mse_sorted, bins=30, density=1, facecolor='#4682B4', edgecolor='#505050', alpha=0.75, linewidth=1.6)
mse_N = norm.pdf(bins_mse, mu_mse, sigma_mse)
plt.plot(bins_mse, mse_N, color='#483D8B', linestyle=':', linewidth = '2')
plt.suptitle('Distribution of XGBoost MSE of '+str(EPOCH)+' Rounds', fontsize=22)
ax.set_ylabel('Counts', fontsize=17)
ax.set_xlabel('MSE', fontsize=17)
save_name = 'XGBoost_03a_MSE_Distribution_'+c_time+'.png'
save_name = Path('.', DIR, save_name)
plt.savefig(save_name)

fig = plt.figure(figsize=(8, 8), dpi=300)
ax = fig.add_axes([0.11, 0.08, 0.85, 0.84])
n, bins_mae, patches = ax.hist(mae_sorted, bins=30, density=1, facecolor='#3CB371', edgecolor='#006400', alpha=0.75, linewidth=1.6)
mae_N = norm.pdf(bins_mae, mu_mae, sigma_mae)
plt.plot(bins_mae, mae_N, color='#B2F200', linestyle=':', linewidth = '2')
plt.suptitle('Distribution of XGBoost MAE of '+str(EPOCH)+' Rounds', fontsize=22)
ax.set_ylabel('Counts', fontsize=17)
ax.set_xlabel('MAE', fontsize=17)
save_name = 'XGBoost_03b_MAE_Distribution_'+c_time+'.png'
save_name = Path('.', DIR, save_name)
plt.savefig(save_name)

fig = plt.figure(figsize=(8, 8), dpi=300)
ax = fig.add_axes([0.11, 0.08, 0.85, 0.84])
n, bins_r2, patches = ax.hist(r2_sorted, bins=30, density=1, facecolor='#FF6347', edgecolor='#FF4500', alpha=0.75, linewidth=1.6)
r2_N = norm.pdf(bins_r2, mu_r2, sigma_r2)
plt.plot(bins_r2, r2_N, color='#B22222', linestyle=':', linewidth = '2')
plt.suptitle('Distribution of XGBoost R^2 of '+str(EPOCH)+' Rounds', fontsize=22)
ax.set_ylabel('Counts', fontsize=17)
ax.set_xlabel('R^2', fontsize=17)
save_name = 'XGBoost_03c_R2_Distribution_'+c_time+'.png'
save_name = Path('.', DIR, save_name)
plt.savefig(save_name)

In [26]:
if FIND_SPLIT:
    upper_conf = []
    lower_conf = []
    conf_delta = []
    for i in range(len(test_mean_l)):
        j_mean = test_mean_l[i]
        j_std = test_std_l[i]
        conf_int = norm.interval(CONFIDENCE, loc=j_mean, scale=j_std / np.sqrt(len(test_data_m[i])))
        lower_conf.append(conf_int[0])
        upper_conf.append(conf_int[1])
        conf_delta.append(conf_int[1]-conf_int[0])

In [27]:
if FIND_SPLIT:
    fig = plt.figure(figsize=(36, 9), dpi=300)
    ax = fig.add_axes([0.05, 0.11, 0.90, 0.815])
    for i in range(X.shape[0]):
        ax.plot([x_idx[i], x_idx[i]], [upper_conf[i], lower_conf[i]], linestyle='-', color='grey', linewidth=1.4, alpha=0.5)
    sc = ax.scatter(x_idx, test_mean_l, alpha=1.0, marker='_', color='black', s=16)
    sc = ax.scatter(x_idx, true_y, alpha=1.0, marker='x', color='lawngreen', s=24)
    ax.set_xlabel('Sample ID', fontsize=17)
    ax.set_ylabel('Confidence Interval of Test Predictions', fontsize=17)
    plt.grid(which='major', color='#D5D5D5', alpha=0.5)
    plt.suptitle('XGBoost Confidence Interval of Test Predictions vs Sample ID of '+str(EPOCH)+' Rounds & Confidence='+str(CONFIDENCE), fontsize=21)
    save_name = 'XGBoost_02e_Test_Confidence_Intervals_'+c_time+'.png'
    save_name = Path('.', DIR, save_name)
    plt.savefig(save_name)

In [28]:
if FIND_SPLIT:
    fig = plt.figure(figsize=(10, 8), dpi=300)
    ax = fig.add_axes([0.11, 0.08, 0.88, 0.815])
    sc = ax.scatter(true_y, conf_delta, alpha=0.55, marker='o', color='#3CB371')
    ax2 = fig.add_axes([0.73, 0.67, 0.23, 0.20])
    ax.set_xlabel('True Values', fontsize=17)
    ax.set_ylabel('Size of Confidence Intervals', fontsize=17)
    ax.grid(which='major', color='#D5D5D5', alpha=0.5)
    plt.suptitle('XGBoost Scatter Plot of Confidence Intervals vs True Values\nConfidence='+str(CONFIDENCE)+'   Rounds='+str(EPOCH), fontsize=21)
    ax2.hist(conf_delta, bins=20, density=1, facecolor='#3CB371', edgecolor='#006400', alpha=0.75, linewidth=1.6)
    ax2.set_ylabel('Counts', fontsize=13)
    ax2.set_xlabel('Size of Confidence Intervals', fontsize=13)
    save_name = 'XGBoost_02f_Test_Size_Confidence_Intervals_'+c_time+'.png'
    save_name = Path('.', DIR, save_name)
    plt.savefig(save_name)

In [29]:
f_i_temp = f_i.copy()
# 计算相对重要度
[:, 0] = 100.0 * (f_i_temp[:, 0]/(max(f_i_temp[:, 0])-min(f_i_temp[:, 0])))
sorted_idx = np.argsort(-f_i_temp[:, 0])
pos = np.arange(sorted_idx.shape[0]) + .5

fig = plt.figure(figsize=(11, 8), dpi=250)
ax = fig.add_axes([0.11, 0.08, 0.85, 0.84])
ba = ax.barh(pos[:10, ], f_i_temp[sorted_idx[:10, ], 0].flatten().tolist(), align='center', color='#CD5C5C', edgecolor='#A52A2A', linewidth=1.6)
plt.yticks(pos[:10, ], title[sorted_idx[:10, ]])
ax.set_xlabel('Relative Importance', fontsize=17)
plt.suptitle('Variable Importance (First 10) of XGBoost Regressor', fontsize=20)
save_name = 'XGBoost_04a_Feature_Importance_'+c_time+'.png'
save_name = Path('.', DIR, save_name)
plt.savefig(save_name)
save_name = 'XGBoost_04b_Sorted_Feature.csv'
save_name = Path('.', DIR, save_name)
np.savetxt(save_name, sorted_idx.reshape(sorted_idx.shape[0], 1), fmt='%d')
feature_out = np.hstack((title.reshape(title.shape[0], 1), f_i_temp))
save_name = 'XGBoost_04c_All_Feature_Importance.csv'
save_name = Path('.', DIR, save_name)
np.savetxt(save_name, feature_out, fmt='%s', delimiter=',')

SyntaxError: invalid syntax (<ipython-input-29-521382d4df22>, line 3)

In [None]:
mff_d = []
desc_d = []
intensive_d = []
conju_d = []
for i in range(title.shape[0]):
    t = title[i, ]
    temp = [t, str(f_i_temp[i, 0])]
    if t in mff_l:
        mff_d.append(temp)
    elif t in desc_l:
        desc_d.append(temp)
    elif t in conju_l:
        conju_d.append(temp)
    else:
        intensive_d.append(temp)
mff_m = np.array(mff_d).reshape(len(mff_d), 2)
desc_m = np.array(desc_d).reshape(len(desc_d), 2)
conju_m = np.array(conju_d).reshape(len(conju_d), 2)
intensive_m = np.array(intensive_d).reshape(len(intensive_d), 2)
save_name = 'XGBoost_05a_MFF_Featrues_'+c_time+'.csv'
save_name = Path('.', DIR, save_name)
np.savetxt(save_name, mff_m, fmt='%s', delimiter=',', comments='!')
save_name = 'XGBoost_05b_Desc_Featrues_'+c_time+'.csv'
save_name = Path('.', DIR, save_name)
np.savetxt(save_name, desc_m, fmt='%s', delimiter=',', comments='!')
save_name = 'XGBoost_05c_Conju_Features'+c_time+'.csv'
save_name = Path('.', DIR, save_name)
np.savetxt(save_name, conju_m, fmt='%s', delimiter=',', comments='!')
save_name = 'XGBoost_05d_Intensive_Featrues_'+c_time+'.csv'
save_name = Path('.', DIR, save_name)
np.savetxt(save_name, intensive_m, fmt='%s', delimiter=',', comments='!')

In [None]:
save_name = 'XGBoost_06_Performance_Record_'+c_time+'.csv'
save_name = Path('.', DIR, save_name)
m1 = np.array(mse_list).reshape(len(mse_list), 1)
m2 = np.array(mae_list).reshape(len(mae_list), 1)
m3 = np.array(r2_list).reshape(len(r2_list), 1)
m_out = np.vstack((np.array(['MSE', 'MAE', 'R^2']).reshape(1, 3), np.hstack((m1, np.hstack((m2, m3))))))
np.savetxt(save_name, m_out, fmt='%s', delimiter=',')

In [None]:
LOG_NAME = 'XGBoost_07_Log_'+c_time+'.txt'
LOG_NAME = Path('.', DIR, LOG_NAME)
f1 = open(LOG_NAME, 'w+')
f1.write('XGBoost Regressor\n\n')
f1.write('Total Epoch: '+str(EPOCH)+'\n\n')
f1.write('Dataset: '+INPUT_X+', '+INPUT_Y+'\n')
f1.write('Data Shape: '+str(X.shape)+', '+str(y.shape)+'\n\n')
f1.write('Mean MSE: '+str(np.mean(mse_list))+'\n')
f1.write('Min MSE: '+str(min(mse_list))+'\n\n')
f1.write('Mean MAE: '+str(np.mean(mae_list))+'%\n')
f1.write('Min MAE: '+str(min(mae_list))+'%\n\n')
f1.write('Mean R^2: '+str(np.mean(r2_list))+'\n')
f1.write('Max R^2: '+str(max(r2_list))+'\n\n')
f1.write('Feature Importance:\n')
for i in range(title.shape[0]):
    f1.write('name:'+str(title[sorted_idx[i, ], ])+'   value:'+str(f_i_temp[sorted_idx[i, ], 0])+'\n')
f1.close()

In [None]:
import openpyxl
XLSX_FILE = r'/home/jyb/dyh/Python/MachineLearningLog_DYH.xlsx'
data = openpyxl.load_workbook(XLSX_FILE)
table = data['Regressor']
nrows = table.max_row
if not FIND_SPLIT:
    SPLIT_STR = INPUT_SPLIT
else:
    SPLIT_STR = 'None'
out_excel = [c_time_m, 'Yiheng Dai', 'XGB-R '+Version, INPUT_X, INPUT_Y, INPUT_TITLE, SPLIT_STR,
             str(clf.get_params()), str(EPOCH), 'None', 'None', str(min(mse_list)), str(np.mean(mse_list)), 
             'None', 'None', str(max(r2_list)), str(np.mean(r2_list)), str(os.getcwd())+'/'+DIR, SAVE_NAME]
for i in range(len(out_excel)):
    table.cell(nrows+1,i+1).value = out_excel[i]
data.save(XLSX_FILE)