##### AFS-LF  
## 基于循环拟合的自动特征选择  
## Automatic Feature Selection by Loop-Fitting  
最后更新：2022.02.07 戴以恒  
当前版本：V2.5-Beta-SHAP  
### 简介：  
通过随机切分、重复拟合，结合特征重要性排名和协方差矩阵进行的包裹式特征选择和数据降维  
### 更新记录：  
V1.0：框架的搭建  
V1.1：改写了回归拟合的部分，将相同的回归器的运行放在了一起(没有性能提升)，改善了画图，修正了特征重要性的归一化方法  
V1.2-Alpha：GBRT尝试改为多核并行模式  
V1.3-Alpha：增加了最终表现的作图  
V1.4-Alpha：修改了排名的算法逻辑  
V1.5-Alpha：修改了作图表现，优化性能  
V2.0-Beta：实现了GBRT和XGBoost的双并行，可以极大地缩短拟合时间；并更新了最终表现的作图  
V2.1-Beta：减少了文件输出量  
V2.2-Beta-SHAP：修改了框架，用于使用SHAP输出特征重要性，并引入回归器系数用来调整权重  
V2.3-Beta-SHAP：优化了输入输出  
V2.4-Beta-SHAP：增加了50以内特征的作图  
V2.5-Beta-SHAP：增加输出SHAP分数的非绝对值、非归一化形式，用于直接考察原始特征重要性的影响  

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import time
import gc

# Capture the start instant once so that the compact stamp (used in output
# file and directory names) and the human-readable stamp can never disagree
# by a second.
_start_time = time.localtime()
c_time = time.strftime("%Y%m%d_%H%M%S", _start_time)
c_time_m = time.strftime("%Y-%m-%d %H:%M:%S", _start_time)

In [2]:
# Run parameters
# ======== System Setup ========
Version = 'V2.5-Beta-SHAP'
EPOCH = 240               # random train/test splits per generation
REPEAT_ROUND = 1          # fits per split (rows are reshuffled for rounds after the first)
CORE_NUM = 24             # worker processes; EPOCH*REPEAT_ROUND must be an integer multiple of this
TRAIN_TEST_SPLIT = 0.85   # fraction of samples used for training
REGRESSOR_COEF = [0.15, 0.30, 0.55]   # rank weights for Lasso, GBRT, XGBoost
# The GBRT/XGBoost fits are dispatched in int(EPOCH*REPEAT_ROUND/CORE_NUM)
# batches of CORE_NUM tasks. A remainder would be silently dropped and leave
# all-zero rows in the score matrices that are then averaged, so fail early.
if CORE_NUM <= 0 or (EPOCH*REPEAT_ROUND) % CORE_NUM != 0:
    raise ValueError('EPOCH*REPEAT_ROUND ('+str(EPOCH*REPEAT_ROUND)+
                     ') must be a positive integer multiple of CORE_NUM ('+str(CORE_NUM)+')')
# ======== Fit Data Input ========
S_N = 856                 # appears in the sample-indexed input file names
F_N = 94                  # appears in the feature-indexed input file names
INPUT_X = f'Features_{S_N}_{F_N}.csv'
INPUT_Y = f'Values_True_ln_{S_N}.csv'
INPUT_TITLE = f'Title_{F_N}.csv'
INPUT_SMILES = f'Smiles_{S_N}.csv'
ONLY_FIT_ONCE = False     # True: a single generation using every feature
INPUT_FEATURE_COUNTS = f'Feature_Counts_{F_N}.csv'
# ======== Data Output ========
RECORD_NAME = f'C01_Record_AFS_{Version}_{c_time}.txt'
LOG_NAME = f'C02_Log_AFS_{Version}_{c_time}.txt'
FIGURE_NAME_1 = f'MSE_Plot_AFS_{Version}'
FIGURE_NAME_2 = f'R2_Plot_AFS_{Version}'
FIGURE_NAME_3 = f'MAE_Plot_AFS_{Version}'
FIGURE_NAME_4 = f'Performance_AFS_{Version}'
FIGURE_NAME_5 = f'MSE_Distribution_AFS_{Version}'
FIGURE_NAME_6 = f'R2_Distribution_AFS_{Version}'
FIGURE_NAME_7 = f'MAE_Distribution_AFS_{Version}'
FIGURE_NAME_8 = f'Covariance_Matrix_AFS_{Version}'
FIGURE_NAME_9 = f'XGB_Performance_AFS_{Version}'
TXT_NAME_1 = f'Feature_Name_AFS_{Version}'
TXT_NAME_2 = f'Feature_Importance_AFS_{Version}'
TXT_NAME_3 = f'Covariance_Matrix_AFS_{Version}'
TXT_NAME_4 = f'Feature_Importance_Sorted_AFS_{Version}'
TXT_NAME_5 = f'Real_Feature_Importance_Sorted_AFS_{Version}'
DETAILED_OUTPUT = False   # True: also write the per-generation covariance/importance CSVs and MSE/MAE histograms

In [3]:
import shap
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso
from sklearn import model_selection
import joblib
from multiprocessing import Pool

In [4]:
import os
from pathlib import Path
# Create the timestamped output directory for this run and open the run
# record inside it. f1 stays open; later cells append to it and close it.
DIR = f'AFS{Version}_{c_time}'
Path(DIR).mkdir()
RECORD_NAME = Path('.') / DIR / RECORD_NAME
f1 = open(RECORD_NAME, mode='w')
f1.write(f'Record of AFS {Version}\n\n')
record_stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
f1.write(f'Generation time: {record_stamp}\n\n\n')

39

In [5]:
# Phase 1: load the feature matrix, targets, feature names, SMILES strings
# and the list of feature counts (one entry per generation).
f1.write('===Phase 1: Read Dataset===\n')
phase_stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
f1.write(f'Begin at {phase_stamp}\n')
X = np.loadtxt(INPUT_X, delimiter=',')
y = np.loadtxt(INPUT_Y)
# comments='!' replaces loadtxt's comment marker for the two text inputs
# (presumably so that '#' inside names/SMILES is kept as data; confirm).
text_options = dict(dtype=str, delimiter=',', comments='!')
title = np.loadtxt(INPUT_TITLE, **text_options)
smiles = np.loadtxt(INPUT_SMILES, **text_options)
# With ONLY_FIT_ONCE a single generation uses every feature; otherwise the
# per-generation feature counts come from file.
feature_c = ([title.shape[0]] if ONLY_FIT_ONCE
             else np.loadtxt(INPUT_FEATURE_COUNTS, dtype=int).flatten().tolist())
f1.write(f'Shape of dataset: {X.shape}, {y.shape}\n')
phase_stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
f1.write(f'===Phase 1 done at {phase_stamp}===\n\n\n')

44

In [6]:
# Phase 2: build the three template regressors. Only their parameter
# dictionaries (lasso_para, gbrt_para, xgb_para) are used later: every fit
# creates a fresh estimator and applies these parameters to it.
f1.write('===Phase 2: Create Regressors===\n')
f1.write('Begin at '+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())+'\n')
lasso = Lasso(alpha=3.0, max_iter=8000, tol=0.005, selection='random', precompute=False)
lasso_para = lasso.get_params()
# The loss is intentionally left at its default (squared error). Spelling it
# as loss='ls' is rejected by scikit-learn >= 1.2, while the newer name
# 'squared_error' is rejected by releases older than 1.0; the default selects
# the same loss in either case.
gbrt = GradientBoostingRegressor(n_estimators=200, verbose=0, validation_fraction=0.15, n_iter_no_change=50, tol=0.00025,
                                 subsample=0.5, warm_start=False, learning_rate=0.045, min_impurity_decrease=0.003972950783280587,
                                 max_depth=9, max_features=0.2913910225812219, max_leaf_nodes=14)
gbrt_para = gbrt.get_params()
# n_jobs=1: the XGBoost fits are run in separate worker processes later on.
xgboost = XGBRegressor(n_estimators=150, learning_rate=0.025, max_depth=13, verbosity=0, booster='gbtree',
                       reg_alpha=np.exp(-6.788644799030888), reg_lambda=np.exp(-7.450413274554533), gamma=np.exp(-5.374463422208394),
                       subsample=0.5, objective='reg:squarederror', n_jobs=1)
xgb_para = xgboost.get_params()
f1.write('Params of Lasso:\n'+str(lasso_para)+'\n\n')
f1.write('Params of GBRT:\n'+str(gbrt_para)+'\n\n')
f1.write('Params of XGBoost Regressor:\n'+str(xgb_para)+'\n\n')
f1.write('===Phase 2 done at '+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())+'===\n\n\n')

44

In [7]:
def draw_distribution(m_in, x_label, title_in, save_name):
    """Save overlaid histograms of the per-fit scores of the three regressors.

    m_in       -- 2-D array; columns 0, 1 and 2 are drawn and labelled
                  Lasso, GBRT and XGBoost respectively.
    x_label    -- x-axis label (the metric name).
    title_in   -- first line of the figure title; the second line lists the
                  mean of each column rounded to 4 decimals.
    save_name  -- path the figure is written to.
    """
    fig = plt.figure(figsize=(10, 8), dpi=250)
    ax = fig.add_axes([0.10, 0.10, 0.84, 0.80])
    # (face colour, edge colour) for columns 0..2
    palette = (('#4682B4', '#505050'), ('#3CB371', '#006400'), ('#FF6347', '#FF4500'))
    for col, (face, edge) in enumerate(palette):
        ax.hist(m_in[:, col], bins=40, density=False, facecolor=face, edgecolor=edge, alpha=0.4)
    ax.set_xlabel(x_label, fontsize=17)
    ax.set_ylabel('Times', fontsize=17)
    plt.legend(['Lasso', 'GBRT', 'XGBoost'], loc='upper left', fontsize=16)
    averages = [str(round(np.mean(m_in[:, col]), 4)) for col in range(3)]
    heading = (title_in+'\nLASSO:'+averages[0]+'     GBRT:'+averages[1] +
               '     XGBoost:'+averages[2])
    plt.suptitle(heading, fontsize=19)
    plt.savefig(save_name)
    # Release the figure explicitly; this is called once per generation.
    fig.clf()
    plt.close()

In [8]:
def draw_cov(m_in, title_in, save_name, save_name2):
    """Plot the covariance matrix of the standardised feature columns.

    Every column of m_in is standardised to zero mean and unit (population)
    standard deviation, the covariance matrix of the result is computed and
    the identity is subtracted so that the diagonal does not dominate the
    colour scale.

    m_in       -- 2-D array, one row per sample and one column per feature.
                  It is not modified.
    title_in   -- figure title.
    save_name  -- path of the image to write.
    save_name2 -- path of the CSV copy of the matrix; only written when the
                  module-level DETAILED_OUTPUT flag is true.

    Constant columns (zero standard deviation) are left at zero after
    centring instead of being divided by zero, so they yield zero covariance
    rather than a NaN row/column; their diagonal entry is left at zero too.
    """
    # Work on a float copy: integer input would otherwise be truncated.
    m = np.array(m_in, dtype=float)
    spread = m.std(axis=0)
    varying = spread > 0
    m = (m - m.mean(axis=0)) / np.where(varying, spread, 1.0)
    # atleast_2d: np.cov returns a 0-d array when there is a single feature.
    cov_m = np.atleast_2d(np.cov(m, rowvar=False))
    cov_m = cov_m - np.diag(varying.astype(float))
    if DETAILED_OUTPUT:
        np.savetxt(save_name2, cov_m, fmt='%s', delimiter=',')
    fig = plt.figure(figsize=(10, 8), dpi=250)
    ax = fig.add_axes([0.05, 0.08, 0.90, 0.86])
    im = ax.imshow(cov_m, cmap='plasma_r', origin='lower', vmin=-1.0, vmax=1.0)
    ax.set_xlabel('Features', fontsize=15)
    ax.set_ylabel('Features', fontsize=15)
    plt.suptitle(title_in, fontsize=20)
    plt.colorbar(im)
    plt.savefig(save_name)
    fig.clf()
    plt.close()

In [9]:
def LASSO_SHAP(X, clf_new):
    """Return the mean absolute SHAP value of every feature as a flat list.

    X        -- samples the SHAP values are evaluated on.
    clf_new  -- fitted model handed to shap.Explainer.
    """
    explainer = shap.Explainer(clf_new)
    contributions = explainer.shap_values(X)
    magnitude = np.abs(contributions)
    return np.mean(magnitude, axis=0).flatten().tolist()

In [10]:
def GBRT_Fit(X, y, X_train, y_train, X_test, y_test, paras):
    """Fit one GBRT model and report its test scores and SHAP importances.

    X                 -- full feature matrix; SHAP values are computed on it.
    y                 -- unused; kept so the signature matches XGB_Fit.
    X_train, y_train  -- data the model is fitted on.
    X_test, y_test    -- data the scores are computed on.
    paras             -- estimator parameters applied to a fresh
                         GradientBoostingRegressor.

    Returns the tuple ([mse, mae, r2, f_i, f_i_o], 'None') where f_i is the
    per-feature mean of |SHAP| and f_i_o the per-feature mean of the signed
    SHAP values, both as flat lists.
    """
    model = GradientBoostingRegressor()
    model.set_params(**paras)
    # Fit the model
    model.fit(X_train, y_train)
    # Score on the held-out part
    predicted = model.predict(X_test)
    scores = [
        mean_squared_error(y_test, predicted),
        mean_absolute_error(y_test, predicted),
        r2_score(y_test, predicted),
    ]
    contributions = shap.Explainer(model).shap_values(X)
    mean_abs = np.mean(np.abs(contributions), axis=0).flatten().tolist()
    mean_signed = np.mean(contributions, axis=0).flatten().tolist()
    return (scores + [mean_abs, mean_signed], 'None')

In [11]:
def XGB_Fit(X, y, X_train, y_train, X_test, y_test, paras):
    """Fit one XGBoost model and report its test scores and SHAP importances.

    X                 -- full feature matrix; SHAP values are computed on it.
    y                 -- unused; kept so the signature matches GBRT_Fit.
    X_train, y_train  -- data the model is fitted on.
    X_test, y_test    -- data the scores are computed on; also passed to
                         fit() as the early-stopping evaluation set.
    paras             -- estimator parameters applied to a fresh XGBRegressor.

    Returns the tuple ([mse, mae, r2, f_i, f_i_o], 'None') where f_i is the
    per-feature mean of |SHAP| and f_i_o the per-feature mean of the signed
    SHAP values, both as flat lists.
    """
    model = XGBRegressor()
    model.set_params(**paras)
    # Fit the model.
    # NOTE(review): the test split doubles as the early-stopping eval_set, so
    # the scores below may be optimistic if the library predicts from the
    # best-scoring iteration -- confirm against the installed xgboost version.
    # Passing early_stopping_rounds to fit() is also version dependent.
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=150, verbose=False)
    # Score on the held-out part
    predicted = model.predict(X_test)
    scores = [
        mean_squared_error(y_test, predicted),
        mean_absolute_error(y_test, predicted),
        r2_score(y_test, predicted),
    ]
    contributions = shap.Explainer(model).shap_values(X)
    mean_abs = np.mean(np.abs(contributions), axis=0).flatten().tolist()
    mean_signed = np.mean(contributions, axis=0).flatten().tolist()
    return (scores + [mean_abs, mean_signed], 'None')

In [12]:
# Phase 3: for every entry of feature_c ("generation") keep the best-ranked
# features of the previous generation, refit the three regressors on many
# random splits, rank the features and write the per-generation outputs.
f1.write('===Phase 3: Fit and Feature Selection===\n')
f1.write('Begin at '+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())+'\n')
# Score matrices used by the final plots: one row per generation; columns
# 0-2 hold the Lasso/GBRT/XGBoost means, column 3 the average of the three.
final_mse_m = np.zeros((len(feature_c), 4))
final_mae_m = np.zeros((len(feature_c), 4))
final_r2_m = np.zeros((len(feature_c), 4))
# Feature indices in ranked order; starts as the identity order 0..n-1.
sort_features = np.linspace(0, title.shape[0]-1, title.shape[0]).astype(int).flatten().tolist()
# NOTE(review): `_` is used as the real generation index throughout this loop.
for _ in range(len(feature_c)):
    f_c_num = feature_c[_]
    print(_+1, 'Epoch start with', f_c_num, 'features')
    f1.write('Generation '+str(_+1).zfill(2)+' Begin with '+str(f_c_num)+' Features\n\tat '+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())+'\n')
    # Keep the f_c_num best-ranked features. X (and title below) are narrowed
    # in place, so sort_features always indexes the current, reduced X.
    s_l = sort_features[:f_c_num]
    X = X[:, s_l]
    # Number of rows that go into the training part of every split.
    point = round(X.shape[0]*TRAIN_TEST_SPLIT)
    save_name = 'A'+str(_+1).zfill(2)+'_01_'+FIGURE_NAME_8+'_'+str(f_c_num)+'_Features.png'
    save_name = Path('.', DIR, save_name)
    save_name2 = 'A'+str(_+1).zfill(2)+'_02_'+TXT_NAME_3+'_'+str(f_c_num)+'_Features.csv'
    save_name2 = Path('.', DIR, save_name2)
    draw_cov(X, 'Covariance Matrix of '+str(f_c_num)+' Features', save_name, save_name2)
    title = title[s_l, ]
    save_name = 'A'+str(_+1).zfill(2)+'_03_'+TXT_NAME_1+'_'+str(f_c_num)+'_Features.csv'
    save_name = Path('.', DIR, save_name)
    np.savetxt(save_name, title.reshape(title.shape[0], 1), fmt='%s', delimiter=',')
    # Lasso uses its own copy of X, min-max scaled per column to [0, 1000].
    # NOTE(review): a constant column makes max-min zero here (0/0).
    X_L = X.copy()
    for i in range(X.shape[1]):
        X_L[:, i] = (X_L[:, i]-min(X_L[:, i]))/(max(X_L[:, i])-min(X_L[:, i]))*1000
    # R^2-weighted importance accumulators, one row per feature
    # (later rescaled so the largest value is 100).
    lasso_co = np.zeros((title.shape[0], 1))
    gbrt_f_i = np.zeros((title.shape[0], 1))
    xgb_f_i = np.zeros((title.shape[0], 1))
    # Accumulators for the "original" (signed, not max-normalised) importances.
    lasso_co_o = np.zeros((title.shape[0], 1))
    gbrt_f_i_o = np.zeros((title.shape[0], 1))
    xgb_f_i_o = np.zeros((title.shape[0], 1))
    # Per-fit score records: one row per fit, columns = Lasso, GBRT, XGBoost.
    mse_m = np.zeros((EPOCH*REPEAT_ROUND, 3))
    mae_m = np.zeros((EPOCH*REPEAT_ROUND, 3))
    r2_m = np.zeros((EPOCH*REPEAT_ROUND, 3))
    # Repeated training and fitting
    # LASSO (runs serially in this process)
    index = -1
    for i in range(EPOCH):
        # Fresh random train/test split for every epoch.
        permutation = np.random.permutation(y.shape[0])
        train_idx = permutation[:point]
        test_idx = permutation[point:]
        X_train = X[train_idx, :]
        y_train = y[train_idx]
        X_test = X[test_idx, :]
        y_test = y[test_idx]
        X_L_train = X_L[train_idx, :]
        X_L_test = X_L[test_idx, :]
        for j in range(REPEAT_ROUND):
            index += 1
            # Rounds after the first reuse the split but reshuffle the row
            # order inside the training and the test part.
            if j!=0:
                perm_train = np.random.permutation(X_train.shape[0])
                X_train = X_train[perm_train, :]
                X_L_train = X_L_train[perm_train, :]
                y_train = y_train[perm_train]
                perm_test = np.random.permutation(X_test.shape[0])
                X_test = X_test[perm_test, :]
                X_L_test = X_L_test[perm_test, :]
                y_test = y_test[perm_test]
            # Lasso regression fit on the scaled copy
            lasso_new = Lasso()
            for k, v in lasso_para.items():
                lasso_new.set_params(**{k: v})
            lasso_new.fit(X_L_train, y_train)
            y_pred = lasso_new.predict(X_L_test)
            mse_m[index, 0] = mean_squared_error(y_test, y_pred)
            mae_m[index, 0] = mean_absolute_error(y_test, y_pred)
            r2_m[index, 0] = r2_score(y_test, y_pred)
            # Both accumulators receive the same R^2-weighted coefficients;
            # they only differ in how they are normalised further down.
            lasso_co = lasso_co+(lasso_new.coef_.reshape(title.shape[0], 1)*r2_m[index, 0])
            lasso_co_o = lasso_co_o+(lasso_new.coef_.reshape(title.shape[0], 1)*r2_m[index, 0])
            del y_pred
    # GBRT: fits are submitted in batches of CORE_NUM to a new process pool
    # per batch; every task gets its own random split.
    # NOTE(review): int(EPOCH*REPEAT_ROUND/CORE_NUM) drops any remainder; the
    # unfilled rows of mse_m/mae_m/r2_m then stay zero and are still averaged.
    r_l = []
    for i in range(int(EPOCH*REPEAT_ROUND/CORE_NUM)):
        pool = Pool(CORE_NUM)
        for j in range(CORE_NUM):
            permutation = np.random.permutation(y.shape[0])
            train_idx = permutation[:point]
            test_idx = permutation[point:]
            X_train = X[train_idx, :]
            y_train = y[train_idx]
            X_test = X[test_idx, :]
            y_test = y[test_idx]
            r = pool.apply_async(GBRT_Fit, args=(X, y, X_train, y_train, X_test, y_test, gbrt_para,))
            r_l.append(r)
        pool.close()
        pool.join()
    # Collect the results: results[0] is [mse, mae, r2, f_i, f_i_o].
    for index in range(len(r_l)):
        r = r_l[index]
        results = r.get()
        temp = results[0]
        mse_m[index, 1] = temp[0]
        mae_m[index, 1] = temp[1]
        r2_m[index, 1] = temp[2]
        feature_importance = np.array(temp[3]).reshape(title.shape[0], 1)
        gbrt_f_i = gbrt_f_i+feature_importance*r2_m[index, 1]
        feature_importance_o = np.array(temp[4]).reshape(title.shape[0], 1)
        gbrt_f_i_o = gbrt_f_i_o+feature_importance_o*r2_m[index, 1]
        del results, temp, feature_importance, feature_importance_o
    # XGB: same batching and collection scheme as GBRT above.
    r_l = []
    for i in range(int(EPOCH*REPEAT_ROUND/CORE_NUM)):
        pool = Pool(CORE_NUM)
        for j in range(CORE_NUM):
            permutation = np.random.permutation(y.shape[0])
            train_idx = permutation[:point]
            test_idx = permutation[point:]
            X_train = X[train_idx, :]
            y_train = y[train_idx]
            X_test = X[test_idx, :]
            y_test = y[test_idx]
            r = pool.apply_async(XGB_Fit, args=(X, y, X_train, y_train, X_test, y_test, xgb_para,))
            r_l.append(r)
        pool.close()
        pool.join()
    for index in range(len(r_l)):
        r = r_l[index]
        results = r.get()
        temp = results[0]
        mse_m[index, 2] = temp[0]
        mae_m[index, 2] = temp[1]
        r2_m[index, 2] = temp[2]
        feature_importance = np.array(temp[3]).reshape(title.shape[0], 1)
        xgb_f_i = xgb_f_i + feature_importance*r2_m[index, 2]
        feature_importance_o = np.array(temp[4]).reshape(title.shape[0], 1)
        xgb_f_i_o = xgb_f_i_o + feature_importance_o*r2_m[index, 2]
        del results, temp, feature_importance, feature_importance_o
    # Record this generation's mean scores (column 3 = mean of the three regressors).
    for i in range(3):
        final_mse_m[_, i] = np.mean(mse_m[:, i])
        final_mae_m[_, i] = np.mean(mae_m[:, i])
        final_r2_m[_, i] = np.mean(r2_m[:, i])
    final_mse_m[_, 3] = (final_mse_m[_, 0]+final_mse_m[_, 1]+final_mse_m[_, 2])/3
    final_mae_m[_, 3] = (final_mae_m[_, 0]+final_mae_m[_, 1]+final_mae_m[_, 2])/3
    final_r2_m[_, 3] = (final_r2_m[_, 0]+final_r2_m[_, 1]+final_r2_m[_, 2])/3
    f1.write('MSE:'+str(final_mse_m[_, :])+'\n')
    f1.write('MAE:'+str(final_mae_m[_, :])+'\n')
    f1.write('R^2:'+str(final_r2_m[_, :])+'\n')
    # Score distributions: MSE/MAE only with DETAILED_OUTPUT, R^2 always.
    if DETAILED_OUTPUT:
        save_name = 'A'+str(_+1).zfill(2)+'_04_'+FIGURE_NAME_5+'_'+str(f_c_num)+'_Features.png'
        save_name = Path('.', DIR, save_name)
        draw_distribution(mse_m, 'MSE', 'MSE Distribution of '+str(f_c_num)+' Features', save_name)
        save_name = 'A'+str(_+1).zfill(2)+'_05_'+FIGURE_NAME_7+'_'+str(f_c_num)+'_Features.png'
        save_name = Path('.', DIR, save_name)
        draw_distribution(mae_m, 'MAE', 'MAE Distribution of '+str(f_c_num)+' Features', save_name)
    save_name = 'A'+str(_+1).zfill(2)+'_06_'+FIGURE_NAME_6+'_'+str(f_c_num)+'_Features.png'
    save_name = Path('.', DIR, save_name)
    draw_distribution(r2_m, 'R^2', 'R^2 Distribution of '+str(f_c_num)+' Features', save_name)
    # Feature ranking and selection
    # Normalise the importances so the largest value (largest absolute value
    # for Lasso) becomes 100.
    # NOTE(review): divides by zero if an accumulator is all zeros.
    lasso_co[:, 0] = 100.0 * (lasso_co[:, 0]/max(abs(lasso_co[:, 0])))
    gbrt_f_i[:, 0] = 100.0 * (gbrt_f_i[:, 0]/max(gbrt_f_i[:, 0]))
    xgb_f_i[:, 0] = 100.0 * (xgb_f_i[:, 0]/max(xgb_f_i[:, 0]))
    # Output tables: feature name followed by the Lasso, GBRT, XGBoost columns.
    f_out = np.hstack((title.reshape(title.shape[0], 1), np.hstack((lasso_co, np.hstack((gbrt_f_i, xgb_f_i))))))
    # "Original" importances: R^2-weighted average (sum divided by the sum of R^2).
    lasso_co_o = lasso_co_o / np.sum(r2_m[:, 0])
    gbrt_f_i_o = gbrt_f_i_o / np.sum(r2_m[:, 1])
    xgb_f_i_o = xgb_f_i_o / np.sum(r2_m[:, 2])
    # The Lasso column is multiplied by 1000 (presumably to compensate for the
    # x1000 feature scaling applied to X_L above -- confirm).
    f_o_out = np.hstack((title.reshape(title.shape[0], 1), np.hstack((lasso_co_o*1000, np.hstack((gbrt_f_i_o, xgb_f_i_o))))))
    # f_out = np.vstack((np.array(['Feature', 'Lasso', 'GBRT', 'XGBoost']).reshape(1, 4), f_out))
    if DETAILED_OUTPUT:
        save_name = 'A'+str(_+1).zfill(2)+'_07_'+TXT_NAME_2+'_'+str(f_c_num)+'_Features.csv'
        save_name = Path('.', DIR, save_name)
        np.savetxt(save_name, np.vstack((np.array(['Feature', 'Lasso', 'GBRT', 'XGBoost']).reshape(1, 4), f_out)), fmt='%s', delimiter=',')
    # Ranking: columns 0-2 of f_i_temp hold each regressor's rank of a feature
    # (1 = most important), column 3 the combined rank.
    f_i_temp = np.zeros((title.shape[0], 4))
    permu_1 = np.argsort(-np.abs(lasso_co).reshape(title.shape[0], ))
    permu_2 = np.argsort(-gbrt_f_i.reshape(title.shape[0], ))
    permu_3 = np.argsort(-xgb_f_i.reshape(title.shape[0], ))
    for i in range(title.shape[0]):
        f_i_temp[permu_1[i], 0] = i+1
        f_i_temp[permu_2[i], 1] = i+1
        f_i_temp[permu_3[i], 2] = i+1
    # Combined rank = weighted mean of the three ranks; the weight of a
    # regressor is its mean R^2 times its REGRESSOR_COEF entry.
    m1 = np.mean(r2_m[:, 0])
    m2 = np.mean(r2_m[:, 1])
    m3 = np.mean(r2_m[:, 2])
    for i in range(title.shape[0]):
        f_i_temp[i, 3] = (f_i_temp[i, 0]*(m1*REGRESSOR_COEF[0])+
                          f_i_temp[i, 1]*(m2*REGRESSOR_COEF[1])+
                          f_i_temp[i, 2]*(m3*REGRESSOR_COEF[2]))/(m1*REGRESSOR_COEF[0]+m2*REGRESSOR_COEF[1]+m3*REGRESSOR_COEF[2])
    # Ascending combined rank; the next generation keeps a prefix of this order.
    sort_features = np.argsort(f_i_temp[:, 3]).flatten().tolist()
    f_out = f_out[sort_features, :]
    f_out = np.hstack((f_out, f_i_temp[sort_features, :]))
    save_name = 'A'+str(_+1).zfill(2)+'_08_'+TXT_NAME_4+'_'+str(f_c_num)+'_Features.csv'
    save_name = Path('.', DIR, save_name)
    np.savetxt(save_name, np.vstack((np.array(['Feature', 'Lasso', 'GBRT', 'XGBoost', 'Lasso Rank', 'GBRT Rank', 'XGBoost Rank', 'Rank']).reshape(1, 8), 
                                     f_out)), fmt='%s', delimiter=',')
    f_o_out = f_o_out[sort_features, :]
    f_o_out = np.hstack((f_o_out, f_i_temp[sort_features, :]))
    save_name = 'A'+str(_+1).zfill(2)+'_09_'+TXT_NAME_5+'_'+str(f_c_num)+'_Features.csv'
    save_name = Path('.', DIR, save_name)
    np.savetxt(save_name, np.vstack((np.array(['Feature', 'Lasso', 'GBRT', 'XGBoost', 'Lasso Rank', 'GBRT Rank', 'XGBoost Rank', 'Rank']).reshape(1, 8), 
                                     f_o_out)), fmt='%s', delimiter=',')
    
    gc.collect()
    f1.write('Generation '+str(_+1).zfill(2)+' Done with '+str(f_c_num)+' Features\n\tat '+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())+'\n\n')
f1.write('===Phase 3 done at '+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())+'===\n\n\n')

1 Epoch start with 94 features
2 Epoch start with 90 features
3 Epoch start with 85 features
4 Epoch start with 80 features
5 Epoch start with 75 features
6 Epoch start with 70 features
7 Epoch start with 68 features
8 Epoch start with 65 features
9 Epoch start with 62 features
10 Epoch start with 59 features
11 Epoch start with 56 features
12 Epoch start with 54 features
13 Epoch start with 52 features
14 Epoch start with 50 features
15 Epoch start with 48 features
16 Epoch start with 46 features
17 Epoch start with 44 features
18 Epoch start with 42 features
19 Epoch start with 40 features
20 Epoch start with 38 features
21 Epoch start with 36 features
22 Epoch start with 34 features
23 Epoch start with 32 features
24 Epoch start with 30 features
25 Epoch start with 28 features
26 Epoch start with 26 features
27 Epoch start with 24 features


KeyboardInterrupt: 

In [None]:
# Phase 4: plot the per-generation mean scores against the feature count.
f1.write('===Phase 4: Draw Full Plots===\n')
phase_stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
f1.write(f'Begin at {phase_stamp}\n')

x_idx = feature_c
gens_tag = '_'+str(len(feature_c))+'_Gens.png'
# (colour, marker) of score columns 0..3 = Lasso, GBRT, XGBoost, mean
series_styles = (("#F08080", 'o'), ("#00CED1", 's'), ("#9ACD32", '^'), ("#FAA460", '*'))

# One figure per metric with all four series:
# (score matrix, metric label, axes rectangle, file prefix, file stem)
overview_specs = (
    (final_mse_m, 'MSE', [0.10, 0.10, 0.82, 0.82], 'B01a_', FIGURE_NAME_1),
    (final_mae_m, 'MAE', [0.10, 0.10, 0.82, 0.82], 'B01b_', FIGURE_NAME_3),
    (final_r2_m, 'R^2', [0.10, 0.15, 0.80, 0.78], 'B01c_', FIGURE_NAME_2),
)
for score_m, metric, rect, prefix, stem in overview_specs:
    fig = plt.figure(figsize=(10, 8), dpi=250)
    ax = fig.add_axes(rect)
    for col, (colour, mark) in enumerate(series_styles):
        ax.plot(x_idx, score_m[:, col].flatten().tolist(), color=colour, linewidth=3, linestyle=':', marker=mark, zorder=10)
    ax.set_xlabel('Number of Features', fontsize=17)
    ax.set_ylabel(metric, fontsize=17)
    ax.grid(which='major', color='#D5D5D5', alpha=0.5, zorder=1)
    plt.legend(['Lasso', 'GBRT', 'XGBoost', 'mean'], loc='lower right', fontsize=16)
    plt.suptitle(metric+' - Feature Counts Plot', fontsize=21)
    save_name = Path('.', DIR, prefix+stem+gens_tag)
    plt.savefig(save_name)
    plt.close()

# Two stacked three-panel figures (MSE / MAE / R^2 from bottom to top):
# first the mean column (3), then the XGBoost column (2).
# (score column, title suffix, file prefix, file stem)
stacked_specs = (
    (3, ' Performance Plot', 'B01d_', FIGURE_NAME_4),
    (2, ' XGB Performance Plot', 'B02_', FIGURE_NAME_9),
)
for col, title_tail, prefix, stem in stacked_specs:
    fig = plt.figure(figsize=(10, 8), dpi=250)
    ax1 = fig.add_axes([0.10, 0.07, 0.80, 0.25])
    ax2 = fig.add_axes([0.10, 0.38, 0.80, 0.25])
    ax3 = fig.add_axes([0.10, 0.69, 0.80, 0.25])
    panels = (
        (ax1, final_mse_m, 'MSE', "#F08080", 'o'),
        (ax2, final_mae_m, 'MAE', "#00CED1", 's'),
        (ax3, final_r2_m, 'R^2', "#9ACD32", '^'),
    )
    for panel, score_m, metric, colour, mark in panels:
        panel.plot(x_idx, score_m[:, col].flatten().tolist(), color=colour, linewidth=3, linestyle=':', marker=mark, zorder=10)
    # Only the bottom panel carries the x label.
    ax1.set_xlabel('Number of Features', fontsize=15)
    for panel, score_m, metric, colour, mark in panels:
        panel.set_ylabel(metric, fontsize=15)
    for panel, score_m, metric, colour, mark in panels:
        panel.grid(which='major', color='#D5D5D5', alpha=0.5, zorder=1)
    plt.suptitle('AFS '+Version+title_tail, fontsize=19)
    save_name = Path('.', DIR, prefix+stem+gens_tag)
    plt.savefig(save_name)
    plt.close()

phase_stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
f1.write(f'===Phase 4 done at {phase_stamp}===\n\n\n')

In [None]:
# Phase 5: when the run starts with at least 50 features, redraw the score
# plots for only the generations from the first one with <= 50 features on.
# An empty feature_c is skipped instead of raising IndexError.
if feature_c and feature_c[0] >= 50:
    f1.write('===Phase 5: Draw Last Few Points===\n')
    f1.write('Begin at '+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())+'\n')
    # Index of the first generation with at most 50 features. It is None
    # when every generation uses more than 50 features; plotting is then
    # skipped instead of failing on an unassigned bp.
    bp = next((pos for pos, count in enumerate(feature_c) if count <= 50), None)
    if bp is None:
        f1.write('No generation with 50 or fewer features; nothing to draw\n')
    else:
        # feature_c is sliced directly so this cell does not depend on the
        # x_idx alias created by the Phase 4 cell.
        tail_x = feature_c[bp:]
        tail_tag = '_Last_'+str(len(tail_x))+'_Gens.png'
        # (colour, marker) of score columns 0..3 = Lasso, GBRT, XGBoost, mean
        series_styles = (("#F08080", 'o'), ("#00CED1", 's'), ("#9ACD32", '^'), ("#FAA460", '*'))

        # One figure per metric with all four series:
        # (score matrix, metric label, axes rectangle, legend labels, file prefix, file stem)
        overview_specs = (
            (final_mse_m, 'MSE', [0.10, 0.10, 0.82, 0.82], ['LASSO', 'GBRT', 'XGB', 'mean'], 'B03a_', FIGURE_NAME_1),
            (final_mae_m, 'MAE', [0.10, 0.10, 0.82, 0.82], ['LASSO', 'GBRT', 'XGB', 'mean'], 'B03b_', FIGURE_NAME_3),
            (final_r2_m, 'R^2', [0.10, 0.15, 0.80, 0.78], ['Lasso', 'GBRT', 'XGBoost', 'mean'], 'B03c_', FIGURE_NAME_2),
        )
        for score_m, metric, rect, legend_labels, prefix, stem in overview_specs:
            fig = plt.figure(figsize=(10, 8), dpi=250)
            ax = fig.add_axes(rect)
            for col, (colour, mark) in enumerate(series_styles):
                ax.plot(tail_x, score_m[bp:, col].flatten().tolist(), color=colour, linewidth=3, linestyle=':', marker=mark, zorder=10)
            ax.set_xlabel('Number of Features', fontsize=17)
            ax.set_ylabel(metric, fontsize=17)
            ax.grid(which='major', color='#D5D5D5', alpha=0.5, zorder=1)
            plt.legend(legend_labels, loc='lower right', fontsize=16)
            plt.suptitle(metric+' - Feature Counts Plot', fontsize=21)
            save_name = Path('.', DIR, prefix+stem+tail_tag)
            plt.savefig(save_name)
            plt.close()

        # Two stacked three-panel figures (MSE / MAE / R^2 from bottom to
        # top): first the mean column (3), then the XGBoost column (2).
        # (score column, title suffix, file prefix, file stem)
        stacked_specs = (
            (3, ' Performance Plot', 'B03d_', FIGURE_NAME_4),
            (2, ' XGB Performance Plot', 'B02_', FIGURE_NAME_9),
        )
        for col, title_tail, prefix, stem in stacked_specs:
            fig = plt.figure(figsize=(10, 8), dpi=250)
            ax1 = fig.add_axes([0.10, 0.07, 0.80, 0.25])
            ax2 = fig.add_axes([0.10, 0.38, 0.80, 0.25])
            ax3 = fig.add_axes([0.10, 0.69, 0.80, 0.25])
            panels = (
                (ax1, final_mse_m, 'MSE', "#F08080", 'o'),
                (ax2, final_mae_m, 'MAE', "#00CED1", 's'),
                (ax3, final_r2_m, 'R^2', "#9ACD32", '^'),
            )
            for panel, score_m, metric, colour, mark in panels:
                panel.plot(tail_x, score_m[bp:, col].flatten().tolist(), color=colour, linewidth=3, linestyle=':', marker=mark, zorder=10)
                panel.set_ylabel(metric, fontsize=15)
                panel.grid(which='major', color='#D5D5D5', alpha=0.5, zorder=1)
            # Only the bottom panel carries the x label.
            ax1.set_xlabel('Number of Features', fontsize=15)
            plt.suptitle('AFS '+Version+title_tail, fontsize=19)
            save_name = Path('.', DIR, prefix+stem+tail_tag)
            plt.savefig(save_name)
            plt.close()

    f1.write('===Phase 5 done at '+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())+'===\n\n\n')

In [None]:
# Close the run record, then write a separate log file summarising the
# inputs and parameters of this run.
f1.write('\n\n   AFS-LF Done Normally at '+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())+'\n\n\n')
f1.close()
LOG_NAME = Path('.', DIR, LOG_NAME)
# The context manager closes the log even if one of the writes raises.
with open(LOG_NAME, 'w') as f2:
    f2.write('Log of AFS-LF '+Version+'\n')
    f2.write('Log generation time: '+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())+'\n\n')
    f2.write('Input data:\n')
    f2.write('Feature Matrix: '+INPUT_X+'\n')
    f2.write('Label Values: '+INPUT_Y+'\n')
    f2.write('Smiles List: '+INPUT_SMILES+'\n')
    f2.write('Title List: '+INPUT_TITLE+'\n')
    f2.write('Feature Count List: '+INPUT_FEATURE_COUNTS+'\n\n')
    f2.write('Parameters:\n')
    f2.write('Total Generation: '+str(len(feature_c))+'\n')
    f2.write('Epoch of every Generation: '+str(EPOCH)+'\n')
    f2.write('Round of every Epoch: '+str(REPEAT_ROUND)+'\n')
    f2.write('Split Ratio of Training and Testing Sets: '+str(TRAIN_TEST_SPLIT)+'\n')
    f2.write('Accumulating Coef of Regressors: LASSO: '+str(REGRESSOR_COEF[0])+' GBRT:'+str(REGRESSOR_COEF[1])+' XGB:'+str(REGRESSOR_COEF[2])+'\n')
    f2.write('Params of Regressors:\n')
    f2.write('Params of Lasso:\n'+str(lasso_para)+'\n\n')
    f2.write('Params of GBRT:\n'+str(gbrt_para)+'\n\n')
    f2.write('Params of XGBoost Regressor:\n'+str(xgb_para)+'\n\n')