In [1]:
from collections import defaultdict
from pprint import pprint
import time
import os
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# matplotlib config
pd.options.display.max_rows = 10  
%matplotlib inline

# ML
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.model_selection import train_test_split
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)  # 10cv shuffle 
from sklearn.externals import joblib  # save model
from sklearn.feature_selection import RFE,RFECV  # fs
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb

from tqdm import tqdm, trange
with open("./config.json") as f:
    config = json.loads(f.read())
    

# import data
from balance_date import BalanceDate
# custom tool
import utils



配置文件
{'BTK': './out/BTK.faste',
 'aaindex': './data/aaindexmatrix.txt',
 'all_feature': './data/all_feature.csv',
 'feature_kind': './data/features_kind.xls',
 'feature_selected': {'LightGBM_rfe_100': './out/20210203/10cv_3class_feature_select_LightGBM_direct_100.rfe',
                      'LightGBM_rfe_20': './out/20210203/10cv_3class_feature_select_LightGBM_direct_20.rfe',
                      'LightGBM_rfe_30': './out/20210108/10cv_3class_feature_select_LightGBM_30.rfe',
                      'LightGBM_rfe_50': './out/20210203/10cv_3class_feature_select_LightGBM_direct_50.rfe',
                      'RandomForeast': './out/20201224/10cv_3class_feature_select_RandomForeast.rfecv',
                      'RandomForeast_rfe_30': './out/20201226/10cv_3class_feature_select_RandomForeast_30.rfe',
                      'XGBoost': './out/20201224/10cv_3class_feature_select_XGBoost.rfecv',
                      'XGBoost_rfe_30': './out/20201226/10cv_3class_feature_select_XGBoost_30.rfe',
  

In [2]:
# Feature names as returned by the project data helper; this notebook indexes
# it with the boolean `support_` masks of the saved feature selectors.
FEATURE = BalanceDate.get_feature()

# Results of this run go to a per-day directory: ./out/YYYYMMDD
OUT_PATH = os.path.join(".", "out", time.strftime("%Y%m%d"))
# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, which avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(OUT_PATH, exist_ok=True)

print("features：", FEATURE)
print("out path：", OUT_PATH)

features： 0             sift
1             hits
2       ANDN920101
3       ARGP820101
4       ARGP820102
           ...    
1076    NonPolarAA
1077       PolarAA
1078     ChargedAA
1079         PosAA
1080         NegAA
Length: 1081, dtype: object
out path： .\out\20210525


# read data

In [3]:
# Selectors passed to BalanceDate for the CV-splitting strategy and the
# train/test splitting strategy.
cv_kind = 1
train_set_kind = 1

# Two blind test sets, then the training data.
# NOTE(review): g_train looks like a per-sample grouping array - confirm in BalanceDate.
X_test1, y_test1, X_test2, y_test2 = BalanceDate.get_test_values()
X_train, y_train, g_train = BalanceDate.get_train_values(train_set_kind)

# Same split requested twice: once in the default return form (`cvs`) and once
# with return_kind=2 (`cvs_method`), which is what the CV helpers below take.
_split_options = {"kind": cv_kind, "train_set_kind": train_set_kind}
cvs = BalanceDate.split_cv(**_split_options)
cvs_method = BalanceDate.split_cv(return_kind=2, **_split_options)

# Class distribution of each data set.
for _set_label, _set_y in (("test1", y_test1), ("test2", y_test2)):
    print(_set_label)
    utils.solubility_distribute(_set_y, 0, 1)
print("train")
# Kept as the bare last expression so the notebook also displays its return value.
utils.solubility_distribute(y_train, 0, 1)

划分测试/训练集方法——1. 按照原始比例
划分测试/训练集方法——1. 按照原始比例
划分10cv
训练集数目: (5666, 1091)
划分cv方法—— 1: 默认 随机取，每种（+ - =）取原比例的
返回所有验证集
        variations decrease no-change increase
train1         567      280       193       94
train2         567      280       193       94
train3         567      280       193       94
train4         567      280       193       94
train5         567      280       193       94
train6         567      280       193       94
train7         566      280       193       93
train8         566      280       192       94
train9         566      279       193       94
train10        566      279       193       94
划分测试/训练集方法——1. 按照原始比例
划分10cv
训练集数目: (5666, 1091)
划分cv方法—— 1: 默认 随机取，每种（+ - =）取原比例的
返回使用的cv方法对象
test1
-1: 0: 1 = 12: 22: 12= 1.00: 1.83: 1.00
test2
-1: 0: 1 = 338: 237: 87= 1.00: 0.70: 0.26
train
-1: 0: 1 = 2798: 1929: 939= 1.00: 0.69: 0.34


'-1: 0: 1 = 2798: 1929: 939= 1.00: 0.69: 0.34'

# create model

In [4]:
# Label for the classifier. It is used as a prefix for the evaluation run names
# below and as a lookup key into config["feature_selected"] in the RFECV cell.
name = "lightGBM"
# Single-layer LightGBM classifier with library defaults; only the seed is fixed.
model =  lgb.LGBMClassifier(random_state=0)  # gradient-boosting model
print(name, model)

lightGBM LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)


# direct

In [5]:
# Parallel accumulators: entry k of each list belongs to the same experiment.
# The experiment cells below only append, so re-run this cell first if any of
# them is executed again; otherwise entries are duplicated.
res_cv = []  # 10-fold cross-validation result per experiment
res_blind = []  # blind-test (test1/test2) result per experiment
res_names = []  # experiment label per experiment

In [6]:
# Baseline experiment: the single-layer model on the full feature set.
res_names.append("direct")

# 10-fold cross-validation on the training set.
print("10cv")
_direct_cv_runner = utils.CVUtil(model, name + "direct", cvs_method)
_direct_cv_result = _direct_cv_runner.set_data(X_train, y_train, g_train).fit()
res_cv.append(_direct_cv_result)

# Train on the whole training set, evaluate on both blind test sets.
print("blind")
_direct_blind_runner = utils.BlindTestUtil(model, name + "direct")
_direct_blind_result = _direct_blind_runner.set_data(
    X_train, y_train, X_test1, y_test1, X_test2, y_test2
).fit()
res_blind.append(_direct_blind_result)

10cv
原数据特征数目： 1081
实际数据特征数目： 1081
lightGBMdirect: 进行交叉验证
特征数目： 1081
train数据分布
-1    2798
 0    1929
 1     939
dtype: int64
原有数据数目：2798 : 1929 : 939
------cv 1------
cv test ：2518 : 1736 : 845
cv test ：280 : 193 : 94
------cv 2------
cv test ：2518 : 1736 : 845
cv test ：280 : 193 : 94
------cv 3------
cv test ：2518 : 1736 : 845
cv test ：280 : 193 : 94
------cv 4------
cv test ：2518 : 1736 : 845
cv test ：280 : 193 : 94
------cv 5------
cv test ：2518 : 1736 : 845
cv test ：280 : 193 : 94
------cv 6------
cv test ：2518 : 1736 : 845
cv test ：280 : 193 : 94
------cv 7------
cv test ：2518 : 1736 : 846
cv test ：280 : 193 : 93
------cv 8------
cv test ：2518 : 1737 : 845
cv test ：280 : 192 : 94
------cv 9------
cv test ：2519 : 1736 : 845
cv test ：279 : 193 : 94
------cv 10------
cv test ：2519 : 1736 : 845
cv test ：279 : 193 : 94
blind
原数据特征数目： 1081
实际特征数目： 1081
特征数目： 1081
train数据分布
-1    2798
 0    1929
 1     939
dtype: int64
test1数据分布
 0    22
 1    12
-1    12
dtype: int64
test2数据分布
-1    338


# after fs

## rfe

In [8]:
# Feature-selection info: load the saved 30-feature RFE selector for LightGBM.
file_name = config["feature_selected"]["LightGBM_rfe_30"]
rfecv = joblib.load(file_name)  # variable keeps the name `rfecv` although this file is an RFE result
print("dir:", file_name)
print("count:", rfecv.n_features_)
print("feature:")
# List the selected feature names, ten per line.
_selected_names = list(FEATURE[rfecv.support_])
for _row_start in range(0, len(_selected_names), 10):
    _row = _selected_names[_row_start:_row_start + 10]
    print(*_row, end=" ")
    if len(_row) == 10:
        print()
print()

# Single-layer model restricted to the selected features.
res_names.append("direct fs_rfe")

print("10cv")
_fs_cv_runner = utils.CVUtil(model, name + "fs", cvs_method, feature_select=rfecv)
res_cv.append(_fs_cv_runner.set_data(X_train, y_train, g_train).fit())

print("blind")
_fs_blind_runner = utils.BlindTestUtil(model, name + "fs", feature_select=rfecv)
res_blind.append(
    _fs_blind_runner.set_data(X_train, y_train, X_test1, y_test1, X_test2, y_test2).fit()
)

dir: ./out/20210108/10cv_3class_feature_select_LightGBM_30.rfe
count: 30
feature:
BROC820102 BURA740102 JOND920102 KRIW790102 PONP800107 PRAM820103 QIAN880111 QIAN880115 QIAN880118 TANS770105 
VASM830101 VASM830102 WOLS870102 FUKS010101 GEOR030102 KOSJ950110 KOSJ950115 SIMK990101 len.1 AA20D.A 
AA20D.I AA20D.L AA20D.P AA20D.R AA20D.T AA20D.V NonPolarAA PolarAA ChargedAA PosAA 

10cv
原数据特征数目： 1081
实际数据特征数目： 30
lightGBMfs: 进行交叉验证
特征数目： 30
train数据分布
-1    2798
 0    1929
 1     939
dtype: int64
原有数据数目：2798 : 1929 : 939
------cv 1------
cv test ：2518 : 1736 : 845
cv test ：280 : 193 : 94
------cv 2------
cv test ：2518 : 1736 : 845
cv test ：280 : 193 : 94
------cv 3------
cv test ：2518 : 1736 : 845
cv test ：280 : 193 : 94
------cv 4------
cv test ：2518 : 1736 : 845
cv test ：280 : 193 : 94
------cv 5------
cv test ：2518 : 1736 : 845
cv test ：280 : 193 : 94
------cv 6------
cv test ：2518 : 1736 : 845
cv test ：280 : 193 : 94
------cv 7------
cv test ：2518 : 1736 : 846
cv test ：280 : 193 : 93
--

## rfecv

In [9]:
# Load the saved RFECV selector registered in the config under the model name.
file_name = config["feature_selected"][name]
rfecv = joblib.load(file_name)
print("dir:", file_name)
print("count:", rfecv.n_features_)
print("feature:")
# List the selected feature names, ten per line.
_selected_names = list(FEATURE[rfecv.support_])
for _row_start in range(0, len(_selected_names), 10):
    _row = _selected_names[_row_start:_row_start + 10]
    print(*_row, end=" ")
    if len(_row) == 10:
        print()
print()

# Single-layer model restricted to the RFECV-selected features.
res_names.append("direct fs_rfecv")

print("10cv")
_fs_cv_runner = utils.CVUtil(model, name + "fs", cvs_method, feature_select=rfecv)
res_cv.append(_fs_cv_runner.set_data(X_train, y_train, g_train).fit())

print("blind")
_fs_blind_runner = utils.BlindTestUtil(model, name + "fs", feature_select=rfecv)
res_blind.append(
    _fs_blind_runner.set_data(X_train, y_train, X_test1, y_test1, X_test2, y_test2).fit()
)

dir: ./out/20210108/10cv_3class_feature_select_LightGBM.rfecv
count: 75
feature:
sift hits ARGP820102 BROC820102 BUNA790102 BURA740102 CHOP780204 CHOP780205 CHOP780211 CIDH920102 
DAYM780201 FASG760103 ISOY800104 ISOY800107 JOND920102 KRIW790102 NAKH920107 OOBM850104 PONP800105 PONP800107 
PRAM820103 QIAN880111 QIAN880115 QIAN880118 RICJ880114 ROBB760107 SNEP660104 TANS770105 TANS770108 VASM830101 
VASM830102 WOLS870101 WOLS870102 FODM020101 FUKS010101 FUKS010107 SUYM030101 GEOR030102 GEOR030103 GEOR030107 
BAEK050101 DIGM050101 FITW660101 NIEK910102 KOSJ950110 KOSJ950115 DOSZ010104 DAYM780302 SIMK990101 SIMK990103 
SIMK990104 ZHAC000102 len.1 AA20D.A AA20D.D AA20D.E AA20D.F AA20D.G AA20D.I AA20D.K 
AA20D.L AA20D.M AA20D.N AA20D.P AA20D.Q AA20D.R AA20D.S AA20D.T AA20D.V AA20D.Y 
NonPolarAA PolarAA ChargedAA PosAA NegAA 
10cv
原数据特征数目： 1081
实际数据特征数目： 75
lightGBMfs: 进行交叉验证
特征数目： 75
train数据分布
-1    2798
 0    1929
 1     939
dtype: int64
原有数据数目：2798 : 1929 : 939
------cv 1------
cv test ：2

# direct + 2layer

In [10]:
# Label for the classifier, used as a prefix for run names in the 2-layer cells.
name = "lightGBM"
# The estimator *class* (not an instance) is handed to the 2-layer helpers,
# which are given constructor kwargs separately where needed.
Estimator =  lgb.LGBMClassifier
print(name, Estimator)

lightGBM <class 'lightgbm.sklearn.LGBMClassifier'>


In [11]:
# Class handled by the first layer: layer 1 separates `kind` from the rest,
# layer 2 separates the two remaining classes. Held in one variable so the
# evaluated model and the saved model can never be configured differently.
kind = -1

res_names.append("2layer")

# 10-fold cross-validation of the 2-layer model on all features.
print("10cv")
_cvUtil = utils.CVUtilLayer2(Estimator, cvs_method, name + "2layer").set_data(X_train, y_train, g_train, kind).fit()
res_cv.append(_cvUtil)

# Blind test of the 2-layer model on test1/test2.
print("blind")
_blindUtil = utils.BlindTestUtilLayer2(Estimator, name + "2layer").set_data(X_train, y_train, X_test1, y_test1, X_test2, y_test2, kind).fit()
res_blind.append(_blindUtil)

# Fit the same configuration on the full training set and persist it.
kwargs = {"random_state": 0}
layer_direct_model = utils.PonsolLayerEstimator(Estimator, kwargs=kwargs, special_kind=kind)
layer_direct_model.fit(X_train, y_train)
out_path = os.path.join(OUT_PATH, "2layer_directly.model")
print("save to:", out_path)
joblib.dump(layer_direct_model, out_path)

# Sanity check of the persisted model: plain accuracy on both blind sets.
p_test1 = layer_direct_model.predict(X_test1)
p_test2 = layer_direct_model.predict(X_test2)
print("acc test1 =", np.mean(y_test1 == p_test1))
print("acc test2 =", np.mean(y_test2 == p_test2))

10cv
》》》》》》开始10cv
第一层: -1和非-1
第二层: 0和1
------cv 1------
初始化完毕，模型：<class 'lightgbm.sklearn.LGBMClassifier'>，模型参数：{'random_state': 0}，特征：[array(['sift', 'hits', 'ANDN920101', ..., 'ChargedAA', 'PosAA', 'NegAA'],
      dtype=object), array(['sift', 'hits', 'ANDN920101', ..., 'ChargedAA', 'PosAA', 'NegAA'],
      dtype=object)]，第一层区分的类别：-1
输入数据
X 数目: (5099, 1081)
y 分布情况: -1: 0: 1 = 2518: 1736: 845= 1.00: 0.69: 0.34
y= [-1  0 -1 ...  1  1  1]
是否进行使用指定特征： [array(['sift', 'hits', 'ANDN920101', ..., 'ChargedAA', 'PosAA', 'NegAA'],
      dtype=object), array(['sift', 'hits', 'ANDN920101', ..., 'ChargedAA', 'PosAA', 'NegAA'],
      dtype=object)]
第一层使用 1081 个特征: ['sift' 'hits' 'ANDN920101' ... 'ChargedAA' 'PosAA' 'NegAA']
第二层使用 1081 个特征: ['sift' 'hits' 'ANDN920101' ... 'ChargedAA' 'PosAA' 'NegAA']
开始训练第一层模型
X 数目: (5099, 1081)
y 分布情况: -1: 0 = 2518: 2581= 1.00: 1.03
y= [-1  0 -1 ...  0  0  0]
开始训练第二层模型
X 数目: (2581, 1081)
y 分布情况: 0: 1 = 1736: 845= 1.00: 0.49
y= [0 0 0 ... 1 1 1]
输入数据数目: (567, 1081)

开始训练第一层模型
X 数目: (5100, 1081)
y 分布情况: -1: 0 = 2518: 2582= 1.00: 1.03
y= [-1 -1  0 ...  0  0  0]
开始训练第二层模型
X 数目: (2582, 1081)
y 分布情况: 0: 1 = 1736: 846= 1.00: 0.49
y= [0 0 0 ... 1 1 0]
输入数据数目: (566, 1081)
第一层使用 1081 个特征: ['sift' 'hits' 'ANDN920101' ... 'ChargedAA' 'PosAA' 'NegAA']
第二层使用 1081 个特征: ['sift' 'hits' 'ANDN920101' ... 'ChargedAA' 'PosAA' 'NegAA']
开始第一层预测
输入X数目: (566, 1081)
第一层预测结果分布: -1: 0 = 277: 289= 1.00: 1.04
开始第二层预测
输入X数目: (289, 1081)
第二层预测结果分布: 0: 1 = 234: 55= 1.00: 0.24
合并两次预测结果
最终预测结果分布: -1: 0: 1 = 277: 234: 55= 1.00: 0.84: 0.20
------cv 8------
初始化完毕，模型：<class 'lightgbm.sklearn.LGBMClassifier'>，模型参数：{'random_state': 0}，特征：[array(['sift', 'hits', 'ANDN920101', ..., 'ChargedAA', 'PosAA', 'NegAA'],
      dtype=object), array(['sift', 'hits', 'ANDN920101', ..., 'ChargedAA', 'PosAA', 'NegAA'],
      dtype=object)]，第一层区分的类别：-1
输入数据
X 数目: (5100, 1081)
y 分布情况: -1: 0: 1 = 2518: 1737: 845= 1.00: 0.69: 0.34
y= [-1 -1  0 ...  1  0  1]
是否进行使用指定特征： [array(['sift', 'hits', 'ANDN920101'

训练结果：
test2 acc: 0.6676737160120846, gc2: 0.162326077403396
初始化完毕，模型：<class 'lightgbm.sklearn.LGBMClassifier'>，模型参数：{'random_state': 0}，特征：None，第一层区分的类别：-1
输入数据
X 数目: (5666, 1081)
y 分布情况: -1: 0: 1 = 2798: 1929: 939= 1.00: 0.69: 0.34
y= [-1 -1  0 ...  1  0  1]
是否进行使用指定特征： None
第一层使用 1081 个特征: ['sift' 'hits' 'ANDN920101' ... 'ChargedAA' 'PosAA' 'NegAA']
第二层使用 1081 个特征: ['sift' 'hits' 'ANDN920101' ... 'ChargedAA' 'PosAA' 'NegAA']
开始训练第一层模型
X 数目: (5666, 1081)
y 分布情况: -1: 0 = 2798: 2868= 1.00: 1.03
y= [-1 -1  0 ...  0  0  0]
开始训练第二层模型
X 数目: (2868, 1081)
y 分布情况: 0: 1 = 1929: 939= 1.00: 0.49
y= [0 0 0 ... 1 0 1]
save to: .\out\20210525\2layer_directly.model
输入数据数目: (46, 1081)
第一层使用 1081 个特征: ['sift' 'hits' 'ANDN920101' ... 'ChargedAA' 'PosAA' 'NegAA']
第二层使用 1081 个特征: ['sift' 'hits' 'ANDN920101' ... 'ChargedAA' 'PosAA' 'NegAA']
开始第一层预测
输入X数目: (46, 1081)
第一层预测结果分布: -1: 0 = 17: 29= 1.00: 1.71
开始第二层预测
输入X数目: (29, 1081)
第二层预测结果分布: 0: 1 = 27: 2= 1.00: 0.07
合并两次预测结果
最终预测结果分布: -1: 0: 1 = 17: 27: 2= 1

# 2layer + fs

## rfe

In [12]:
# Class handled by the first layer; the second layer separates the other two.
kind = -1
print(">> for {}：{}".format(kind, [j for j in [-1, 0, 1] if j != kind]))

# Saved per-layer RFE selectors (30 features each) for this `kind`.
_path1 = config["feature_selected"]["lightGBM_layer1_{}_ref_30".format(kind)]
_path2 = config["feature_selected"]["lightGBM_layer2_{}_ref_30".format(kind)]
print("rfecv ：")
print("layer1:", _path1)
print("layer2:", _path2)
rfecv_layer1 = joblib.load(_path1)
rfecv_layer2 = joblib.load(_path2)
# Names of the features each layer is allowed to use.
_feature_1 = FEATURE[rfecv_layer1.support_]
_feature_2 = FEATURE[rfecv_layer2.support_]

print("# test{}".format(kind))
res_names.append("2layer fs_rfe_{}_{}".format(len(_feature_1), len(_feature_2)))

# 10-fold cross-validation of the 2-layer model with per-layer feature subsets.
print("10cv")
_cvUtil = utils.CVUtilLayer2(Estimator, cvs_method, name).set_data(X_train, y_train, g_train, kind, _feature_1, _feature_2).fit()
res_cv.append(_cvUtil)

# Blind test on test1/test2 with the same configuration.
print("blind")
_blindUtil = utils.BlindTestUtilLayer2(Estimator, name).set_data(X_train, y_train, X_test1, y_test1, X_test2, y_test2, kind, _feature_1, _feature_2).fit()
res_blind.append(_blindUtil)

# Fit on the full training set and persist. special_kind follows `kind`
# (previously hard-coded to -1) so the saved model always matches the
# configuration evaluated above.
kwargs = {"random_state": 0}
layer_rfe_model = utils.PonsolLayerEstimator(Estimator, kwargs=kwargs, special_kind=kind, feature_selected=[_feature_1, _feature_2])
layer_rfe_model.fit(X_train, y_train)
out_path = os.path.join(OUT_PATH, "2layer_rfe.model")
print("save to:", out_path)
joblib.dump(layer_rfe_model, out_path)

# Sanity check of the persisted model: plain accuracy on both blind sets.
p_test1 = layer_rfe_model.predict(X_test1)
p_test2 = layer_rfe_model.predict(X_test2)
print("acc test1 =", np.mean(y_test1 == p_test1))
print("acc test2 =", np.mean(y_test2 == p_test2))

>> for -1：[0, 1]
rfecv ：
layer1: ./out/20210106/lightGBM_feature_select_for_-1_layer1_30_features.rfe
layer2: ./out/20210106/lightGBM_feature_select_for_-1_layer2_30_features.rfe
# test-1
10cv
》》》》》》开始10cv
第一层: -1和非-1
第二层: 0和1
------cv 1------
初始化完毕，模型：<class 'lightgbm.sklearn.LGBMClassifier'>，模型参数：{'random_state': 0}，特征：[0             sift
1             hits
4       ARGP820102
34      CHOP780209
112     JOND920102
           ...    
1072       AA20D.T
1073       AA20D.V
1076    NonPolarAA
1077       PolarAA
1079         PosAA
Length: 30, dtype: object, 9       BHAR880101
14      BROC820102
38      CHOP780213
53      DAYM780201
61      FASG760103
           ...    
1065       AA20D.L
1076    NonPolarAA
1077       PolarAA
1078     ChargedAA
1079         PosAA
Length: 30, dtype: object]，第一层区分的类别：-1
输入数据
X 数目: (5099, 1081)
y 分布情况: -1: 0: 1 = 2518: 1736: 845= 1.00: 0.69: 0.34
y= [-1  0 -1 ...  1  1  1]
是否进行使用指定特征： [0             sift
1             hits
4       ARGP820102
34      CHOP780209

开始训练第二层模型
X 数目: (2581, 30)
y 分布情况: 0: 1 = 1736: 845= 1.00: 0.49
y= [0 0 0 ... 1 0 1]
输入数据数目: (567, 1081)
第一层使用 30 个特征: 0             sift
1             hits
4       ARGP820102
34      CHOP780209
112     JOND920102
           ...    
1072       AA20D.T
1073       AA20D.V
1076    NonPolarAA
1077       PolarAA
1079         PosAA
Length: 30, dtype: object
第二层使用 30 个特征: 9       BHAR880101
14      BROC820102
38      CHOP780213
53      DAYM780201
61      FASG760103
           ...    
1065       AA20D.L
1076    NonPolarAA
1077       PolarAA
1078     ChargedAA
1079         PosAA
Length: 30, dtype: object
开始第一层预测
输入X数目: (567, 30)
第一层预测结果分布: -1: 0 = 273: 294= 1.00: 1.08
开始第二层预测
输入X数目: (294, 30)
第二层预测结果分布: 0: 1 = 236: 58= 1.00: 0.25
合并两次预测结果
最终预测结果分布: -1: 0: 1 = 273: 236: 58= 1.00: 0.86: 0.21
------cv 5------
初始化完毕，模型：<class 'lightgbm.sklearn.LGBMClassifier'>，模型参数：{'random_state': 0}，特征：[0             sift
1             hits
4       ARGP820102
34      CHOP780209
112     JOND920102
           ...  

------cv 8------
初始化完毕，模型：<class 'lightgbm.sklearn.LGBMClassifier'>，模型参数：{'random_state': 0}，特征：[0             sift
1             hits
4       ARGP820102
34      CHOP780209
112     JOND920102
           ...    
1072       AA20D.T
1073       AA20D.V
1076    NonPolarAA
1077       PolarAA
1079         PosAA
Length: 30, dtype: object, 9       BHAR880101
14      BROC820102
38      CHOP780213
53      DAYM780201
61      FASG760103
           ...    
1065       AA20D.L
1076    NonPolarAA
1077       PolarAA
1078     ChargedAA
1079         PosAA
Length: 30, dtype: object]，第一层区分的类别：-1
输入数据
X 数目: (5100, 1081)
y 分布情况: -1: 0: 1 = 2518: 1737: 845= 1.00: 0.69: 0.34
y= [-1 -1  0 ...  1  0  1]
是否进行使用指定特征： [0             sift
1             hits
4       ARGP820102
34      CHOP780209
112     JOND920102
           ...    
1072       AA20D.T
1073       AA20D.V
1076    NonPolarAA
1077       PolarAA
1079         PosAA
Length: 30, dtype: object, 9       BHAR880101
14      BROC820102
38      CHOP780213
53      D

开始训练第二层模型
X 数目: (2868, 30)
y 分布情况: 0: 1 = 1929: 939= 1.00: 0.49
y= [0 0 0 ... 1 0 1]
输入数据数目: (46, 1081)
第一层使用 30 个特征: 0             sift
1             hits
4       ARGP820102
34      CHOP780209
112     JOND920102
           ...    
1072       AA20D.T
1073       AA20D.V
1076    NonPolarAA
1077       PolarAA
1079         PosAA
Length: 30, dtype: object
第二层使用 30 个特征: 9       BHAR880101
14      BROC820102
38      CHOP780213
53      DAYM780201
61      FASG760103
           ...    
1065       AA20D.L
1076    NonPolarAA
1077       PolarAA
1078     ChargedAA
1079         PosAA
Length: 30, dtype: object
开始第一层预测
输入X数目: (46, 30)
第一层预测结果分布: -1: 0 = 11: 35= 1.00: 3.18
开始第二层预测
输入X数目: (35, 30)
第二层预测结果分布: 0: 1 = 26: 9= 1.00: 0.35
合并两次预测结果
最终预测结果分布: -1: 0: 1 = 11: 26: 9= 1.00: 2.36: 0.82
训练结果：
test1 acc: 0.34782608695652173, gc2: 0.01064991405900497
输入数据数目: (662, 1081)
第一层使用 30 个特征: 0             sift
1             hits
4       ARGP820102
34      CHOP780209
112     JOND920102
           ...    
1072   

## rfecv

In [13]:
# Class handled by the first layer; the second layer separates the other two.
kind = -1
print(">> 针对{}：{}".format(kind, [j for j in [-1, 0, 1] if j != kind]))

# Saved per-layer RFECV selectors for this `kind`.
_path1 = config["feature_selected"]["lightGBM_layer1_{}".format(kind)]
_path2 = config["feature_selected"]["lightGBM_layer2_{}".format(kind)]
print("rfecv layer1 ：", _path1)
print("rfecv layer2 ：", _path2)
print("layer1:", _path1)
print("layer2:", _path2)
rfecv_layer1 = joblib.load(_path1)
rfecv_layer2 = joblib.load(_path2)
# Names of the features each layer is allowed to use.
_feature_1 = FEATURE[rfecv_layer1.support_]
_feature_2 = FEATURE[rfecv_layer2.support_]

# Report the selected features of each layer, ten names per line.
for _layer_label, _layer_features in (("layer1", _feature_1), ("layer2", _feature_2)):
    print(_layer_label)
    print("count：", len(_layer_features))
    print("feature：")
    _layer_names = list(_layer_features)
    for _row_start in range(0, len(_layer_names), 10):
        _row = _layer_names[_row_start:_row_start + 10]
        print(*_row, end=" ")
        if len(_row) == 10:
            print()
    print()

print("# test{}".format(kind))
res_names.append("2layer fs_rfecv")

# 10-fold cross-validation of the 2-layer model with per-layer feature subsets.
print("10cv")
_cvUtil = utils.CVUtilLayer2(Estimator, cvs_method, name).set_data(X_train, y_train, g_train, kind, _feature_1, _feature_2).fit()
res_cv.append(_cvUtil)

# Blind test on test1/test2 with the same configuration.
print("blind")
_blindUtil = utils.BlindTestUtilLayer2(Estimator, name).set_data(X_train, y_train, X_test1, y_test1, X_test2, y_test2, kind, _feature_1, _feature_2).fit()
res_blind.append(_blindUtil)

# Fit on the full training set and persist. special_kind follows `kind`
# (previously hard-coded to -1) so the saved model always matches the
# configuration evaluated above.
kwargs = {"random_state": 0}
layer_rfecv_model = utils.PonsolLayerEstimator(Estimator, kwargs=kwargs, special_kind=kind, feature_selected=[_feature_1, _feature_2])
layer_rfecv_model.fit(X_train, y_train)
out_path = os.path.join(OUT_PATH, "2layer_rfecv.model")
print("save to:", out_path)
joblib.dump(layer_rfecv_model, out_path)

# Sanity check of the persisted model: plain accuracy on both blind sets.
p_test1 = layer_rfecv_model.predict(X_test1)
p_test2 = layer_rfecv_model.predict(X_test2)
print("acc test1 =", np.mean(y_test1 == p_test1))
print("acc test2 =", np.mean(y_test2 == p_test2))

>> 针对-1：[0, 1]
rfecv layer1 ： ./out/20210106/lightGBM_feature_select_for_-1_layer1.rfecv
rfecv layer2 ： ./out/20210106/lightGBM_feature_select_for_-1_layer2.rfecv
layer1: ./out/20210106/lightGBM_feature_select_for_-1_layer1.rfecv
layer2: ./out/20210106/lightGBM_feature_select_for_-1_layer2.rfecv
layer1
count： 89
feature：
sift hits ANDN920101 ARGP820102 BROC820102 BULH740101 BURA740102 CHOP780205 CHOP780209 CHOP780215 
GARJ730101 ISOY800104 ISOY800105 JOND920102 KARP850103 KRIW790102 MAXF760103 NAKH920104 OOBM770102 OOBM850102 
OOBM850104 PALJ810115 PONP800104 PONP800105 PONP800107 PRAM820103 QIAN880118 QIAN880129 QIAN880134 QIAN880139 
RACS820111 RICJ880104 RICJ880114 SNEP660101 TANS770105 VASM830102 ZIMJ680102 ZIMJ680104 MUNV940105 BLAM930101 
FUKS010101 FUKS010105 SUYM030101 GEOR030101 GEOR030102 GEOR030107 GEOR030108 BAEK050101 DIGM050101 BENS940104 
FITW660101 LUTR910106 KOSJ950109 KOSJ950110 KOSJ950114 KOSJ950115 DOSZ010103 GIAG010101 DAYM780302 QUIB020101 
MOOG990101 SIMK990104 Z

开始训练第二层模型
X 数目: (2581, 32)
y 分布情况: 0: 1 = 1736: 845= 1.00: 0.49
y= [0 0 0 ... 1 0 1]
输入数据数目: (567, 1081)
第一层使用 89 个特征: 0             sift
1             hits
2       ANDN920101
4       ARGP820102
14      BROC820102
           ...    
1076    NonPolarAA
1077       PolarAA
1078     ChargedAA
1079         PosAA
1080         NegAA
Length: 89, dtype: object
第二层使用 32 个特征: 9       BHAR880101
14      BROC820102
38      CHOP780213
53      DAYM780201
61      FASG760103
           ...    
1065       AA20D.L
1076    NonPolarAA
1077       PolarAA
1078     ChargedAA
1079         PosAA
Length: 32, dtype: object
开始第一层预测
输入X数目: (567, 89)
第一层预测结果分布: -1: 0 = 288: 279= 1.00: 0.97
开始第二层预测
输入X数目: (279, 32)
第二层预测结果分布: 0: 1 = 213: 66= 1.00: 0.31
合并两次预测结果
最终预测结果分布: -1: 0: 1 = 288: 213: 66= 1.00: 0.74: 0.23
------cv 4------
初始化完毕，模型：<class 'lightgbm.sklearn.LGBMClassifier'>，模型参数：{'random_state': 0}，特征：[0             sift
1             hits
2       ANDN920101
4       ARGP820102
14      BROC820102
           ...  

------cv 7------
初始化完毕，模型：<class 'lightgbm.sklearn.LGBMClassifier'>，模型参数：{'random_state': 0}，特征：[0             sift
1             hits
2       ANDN920101
4       ARGP820102
14      BROC820102
           ...    
1076    NonPolarAA
1077       PolarAA
1078     ChargedAA
1079         PosAA
1080         NegAA
Length: 89, dtype: object, 9       BHAR880101
14      BROC820102
38      CHOP780213
53      DAYM780201
61      FASG760103
           ...    
1065       AA20D.L
1076    NonPolarAA
1077       PolarAA
1078     ChargedAA
1079         PosAA
Length: 32, dtype: object]，第一层区分的类别：-1
输入数据
X 数目: (5100, 1081)
y 分布情况: -1: 0: 1 = 2518: 1736: 846= 1.00: 0.69: 0.34
y= [-1 -1  0 ...  1  1  0]
是否进行使用指定特征： [0             sift
1             hits
2       ANDN920101
4       ARGP820102
14      BROC820102
           ...    
1076    NonPolarAA
1077       PolarAA
1078     ChargedAA
1079         PosAA
1080         NegAA
Length: 89, dtype: object, 9       BHAR880101
14      BROC820102
38      CHOP780213
53      D

开始训练第二层模型
X 数目: (2581, 32)
y 分布情况: 0: 1 = 1736: 845= 1.00: 0.49
y= [0 0 0 ... 1 0 1]
输入数据数目: (566, 1081)
第一层使用 89 个特征: 0             sift
1             hits
2       ANDN920101
4       ARGP820102
14      BROC820102
           ...    
1076    NonPolarAA
1077       PolarAA
1078     ChargedAA
1079         PosAA
1080         NegAA
Length: 89, dtype: object
第二层使用 32 个特征: 9       BHAR880101
14      BROC820102
38      CHOP780213
53      DAYM780201
61      FASG760103
           ...    
1065       AA20D.L
1076    NonPolarAA
1077       PolarAA
1078     ChargedAA
1079         PosAA
Length: 32, dtype: object
开始第一层预测
输入X数目: (566, 89)
第一层预测结果分布: -1: 0 = 285: 281= 1.00: 0.99
开始第二层预测
输入X数目: (281, 32)
第二层预测结果分布: 0: 1 = 226: 55= 1.00: 0.24
合并两次预测结果
最终预测结果分布: -1: 0: 1 = 285: 226: 55= 1.00: 0.79: 0.19
accuracy
0.746 0.765 0.781 0.760 0.743 0.741 0.767 0.761 0.779 0.737
mean: 0.758032481412929

gc2
0.319 0.350 0.378 0.354 0.333 0.320 0.358 0.338 0.374 0.299
mean: 0.3422469383393262
blind
》》》》》》开始 blind test

# save result

In [15]:
# Export the collected CV and blind-test results of every experiment
# (res_names labels the entries) into OUT_PATH under the given file prefix.
# The call is made twice, differing only in the final boolean flag.
# NOTE(review): judging by the recorded output, False writes "*_res.xlsx" and
# True writes "*_res_balanced.xlsx" - confirm the flag's meaning in utils.
utils.result_output(res_cv, res_blind, res_names, OUT_PATH, "10cv_3class_finally", False)  
# Left as the bare last expression so the notebook displays its return value.
utils.result_output(res_cv, res_blind, res_names, OUT_PATH, "10cv_3class_finally", True)  

输出路径为: .\out\20210525\10cv_3class_finally_res.xlsx
输出路径为: .\out\20210525\10cv_3class_finally_res_balanced.xlsx


(            acc                                                              \
          direct direct fs_rfe direct fs_rfecv    2layer 2layer fs_rfe_30_30   
 10cv   0.649793      0.649891        0.662415  0.650916            0.653980   
 test1  0.202020      0.401515        0.146465  0.207071            0.305556   
 test2  0.542766      0.517303        0.524902  0.531594            0.538950   
 
                             gc2                                          \
       2layer fs_rfecv    direct direct fs_rfe direct fs_rfecv    2layer   
 10cv         0.666631  0.297646      0.289494        0.311546  0.312696   
 test1        0.217172  0.105189      0.154427        0.115253  0.073702   
 test2        0.527811  0.149989      0.140775        0.151531  0.140890   
 
                                            
       2layer fs_rfe_30_30 2layer fs_rfecv  
 10cv             0.317140        0.333384  
 test1            0.010676        0.081469  
 test2            0.158884        0.

# save classifier

In [16]:
# Train the final 2-layer classifier on the full training set and persist it.
# No feature_selected argument is passed, so no per-layer feature subset is
# requested; the first layer separates class -1 (special_kind=-1).
Estimator = lgb.LGBMClassifier
kwargs = {"random_state":0, }  # constructor kwargs for the estimator; fixed seed
layer_estimator = utils.PonsolLayerEstimator(Estimator, kwargs=kwargs, special_kind=-1)
layer_estimator.fit(X_train, y_train)

# Saved under the per-day output directory.
out_path = os.path.join(OUT_PATH, "ponsol2.model")
print("save to:", out_path)
# Bare last expression: the notebook displays the list of written file names.
joblib.dump(layer_estimator, out_path)

初始化完毕，模型：<class 'lightgbm.sklearn.LGBMClassifier'>，模型参数：{'random_state': 0}，特征：None，第一层区分的类别：-1
输入数据
X 数目: (5666, 1081)
y 分布情况: -1: 0: 1 = 2798: 1929: 939= 1.00: 0.69: 0.34
y= [-1 -1  0 ...  1  0  1]
是否进行使用指定特征： None
第一层使用 1081 个特征: ['sift' 'hits' 'ANDN920101' ... 'ChargedAA' 'PosAA' 'NegAA']
第二层使用 1081 个特征: ['sift' 'hits' 'ANDN920101' ... 'ChargedAA' 'PosAA' 'NegAA']
开始训练第一层模型
X 数目: (5666, 1081)
y 分布情况: -1: 0 = 2798: 2868= 1.00: 1.03
y= [-1 -1  0 ...  0  0  0]
开始训练第二层模型
X 数目: (2868, 1081)
y 分布情况: 0: 1 = 1929: 939= 1.00: 0.49
y= [0 0 0 ... 1 0 1]
save to: .\out\20210525\ponsol2.model


['.\\out\\20210525\\ponsol2.model']