In [None]:
from collections import defaultdict
from pprint import pprint
import time
import os
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# matplotlib config
pd.options.display.max_rows = 10  
%matplotlib inline

# ML
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.model_selection import train_test_split
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)  # 10cv shuffle 
from sklearn.externals import joblib  # save model
from sklearn.feature_selection import RFE,RFECV  # fs
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb

from tqdm import tqdm, trange
with open("./config.json") as f:
    config = json.loads(f.read())
    

# import data
from balance_date import BalanceDate
# custom tool
import utils

In [None]:
# Feature names supplied by the project's BalanceDate helper; later cells index
# this object with a selector's boolean `support_` mask.
FEATURE = BalanceDate.get_feature()

# Everything this run writes goes under ./out/<YYYYMMDD>.
OUT_PATH = os.path.join(".", "out", time.strftime("%Y%m%d"))
# exist_ok=True replaces the exists()/makedirs() pair: no window between the check
# and the creation, and a regular file sitting at OUT_PATH raises here instead of
# failing later when a model is dumped.
os.makedirs(OUT_PATH, exist_ok=True)

print("features：", FEATURE)
print("out path：", OUT_PATH)

# read data

In [None]:
# Selectors forwarded to BalanceDate below (their meaning is defined in balance_date).
cv_kind = 1
train_set_kind = 1

# Two held-out test sets, the training set (with a third array, g_train), and the
# CV splits in two return formats.
X_test1, y_test1, X_test2, y_test2 = BalanceDate.get_test_values()
X_train, y_train, g_train = BalanceDate.get_train_values(train_set_kind)
cvs = BalanceDate.split_cv(kind=cv_kind, train_set_kind=train_set_kind)
cvs_method = BalanceDate.split_cv(kind=cv_kind, train_set_kind=train_set_kind, return_kind=2)

# Report the label distribution of each split.
for _split_label, _split_y in (("test1", y_test1), ("test2", y_test2)):
    print(_split_label)
    utils.solubility_distribute(_split_y, 0, 1)
print("train")
# Kept as the cell's final expression so anything it returns is still rendered.
utils.solubility_distribute(y_train, 0, 1)

# define model

In [None]:
# Single-stage baseline: LightGBM classifier, library defaults apart from a fixed seed.
name = "lightGBM"
model = lgb.LGBMClassifier(random_state=0)
print(name, model)

# direct

In [None]:
# Parallel accumulators: each experiment cell below appends one entry to every
# list (CV result, blind-test result, experiment name).
res_cv, res_blind, res_names = [], [], []

In [None]:
# Baseline experiment: the plain model on all features.
_direct_label = name + "direct"
res_names.append("direct")

print("10cv")
_direct_cv = utils.CVUtil(model, _direct_label, cvs_method)
res_cv.append(_direct_cv.set_data(X_train, y_train, g_train).fit())

print("blind")
_direct_blind = utils.BlindTestUtil(model, _direct_label)
res_blind.append(
    _direct_blind.set_data(X_train, y_train, X_test1, y_test1, X_test2, y_test2).fit()
)

# after fs

## rfe

In [None]:
# Feature-selection info: load the persisted selector named in the config and
# list the features it keeps (presumably an RFE selector, going by the config key).
file_name = config["feature_selected"]["LightGBM_rfe_30"]
rfecv = joblib.load(file_name)
print("dir:", file_name)
print("count:", rfecv.n_features_)
print("feature:")
# Print the kept features ten per row; a full row is closed with a newline,
# and the trailing print() ends the last (possibly partial) row.
_kept = list(FEATURE[rfecv.support_])
for _start in range(0, len(_kept), 10):
    _row = _kept[_start:_start + 10]
    print(*_row, end=" ")
    if len(_row) == 10:
        print()
print()

# Re-run the baseline experiment with the selector applied.
_fs_label = name + "fs"
res_names.append("direct fs_rfe")
print("10cv")
_fs_cv = utils.CVUtil(model, _fs_label, cvs_method, feature_select=rfecv)
res_cv.append(_fs_cv.set_data(X_train, y_train, g_train).fit())
print("blind")
_fs_blind = utils.BlindTestUtil(model, _fs_label, feature_select=rfecv)
res_blind.append(
    _fs_blind.set_data(X_train, y_train, X_test1, y_test1, X_test2, y_test2).fit()
)

## rfecv

In [None]:
# Load the selector stored under this model's name in the config and list the
# features it keeps.
file_name = config["feature_selected"][name]
rfecv = joblib.load(file_name)
print("dir:", file_name)
print("count:", rfecv.n_features_)
print("feature:")
# Ten names per row: every tenth name is followed by a line break.
for _pos, _feat in enumerate(FEATURE[rfecv.support_], start=1):
    print(_feat, end=" \n" if _pos % 10 == 0 else " ")
print()

# Same experiment as the baseline, restricted to the selected features.
res_names.append("direct fs_rfecv")
_rfecv_label = name + "fs"

print("10cv")
_rfecv_cv = utils.CVUtil(model, _rfecv_label, cvs_method, feature_select=rfecv)
res_cv.append(_rfecv_cv.set_data(X_train, y_train, g_train).fit())

print("blind")
_rfecv_blind = utils.BlindTestUtil(model, _rfecv_label, feature_select=rfecv)
res_blind.append(
    _rfecv_blind.set_data(X_train, y_train, X_test1, y_test1, X_test2, y_test2).fit()
)

# direct + 2layer

In [None]:
# The two-layer utilities below are handed the estimator *class* (not an instance)
# and build their own models from it.
name = "lightGBM"
Estimator = lgb.LGBMClassifier
print(name, Estimator)

In [None]:
# Label passed to the two-layer utilities as their "kind" argument.
i = -1

# Two-layer scheme on all features: 10-fold CV, then the two blind test sets.
res_names.append("2layer")
print("10cv")
_cvUtil = utils.CVUtilLayer2(Estimator, cvs_method, name + "2layer").set_data(X_train, y_train, g_train, i).fit()
res_cv.append(_cvUtil)
print("blind")
_blindUtil = utils.BlindTestUtilLayer2(Estimator, name + "2layer").set_data(X_train, y_train, X_test1, y_test1, X_test2, y_test2, i).fit()
res_blind.append(_blindUtil)

# Fit the same scheme on the full training set and persist it.
kwargs = {"random_state": 0}
# special_kind follows `i` instead of a second hard-coded -1, so the saved model
# cannot drift from the configuration evaluated above.
# NOTE(review): assumes special_kind has the same meaning as the kind argument
# given to set_data() - confirm in utils.
layer_direct_model = utils.PonsolLayerEstimator(Estimator, kwargs=kwargs, special_kind=i)
layer_direct_model.fit(X_train, y_train)
out_path = os.path.join(OUT_PATH, "2layer_directly.model")
print("save to:", out_path)
joblib.dump(layer_direct_model, out_path)

# Accuracy of the persisted model on both blind sets (fraction of matching labels).
p_test1 = layer_direct_model.predict(X_test1)
p_test2 = layer_direct_model.predict(X_test2)
print("acc test1 =", np.mean(y_test1 == p_test1))
print("acc test2 =", np.mean(y_test2 == p_test2))

# 2layer + fs

## rfe

In [None]:
# Label singled out by the two-layer scheme; the print shows it against the
# remaining labels of {-1, 0, 1}.
kind = -1
print(">> for {}：{}".format(kind, [j for j in [-1, 0, 1] if j != kind]))

# Persisted per-layer selectors for this kind, located through the config.
_path1 = config["feature_selected"]["lightGBM_layer1_{}_ref_30".format(kind)]
_path2 = config["feature_selected"]["lightGBM_layer2_{}_ref_30".format(kind)]
print("rfecv ：")
print("layer1:", _path1)
print("layer2:", _path2)
rfecv_layer1 = joblib.load(_path1)
rfecv_layer2 = joblib.load(_path2)
# Feature names kept for each layer.
_feature_1 = FEATURE[rfecv_layer1.support_]
_feature_2 = FEATURE[rfecv_layer2.support_]

# Evaluate the two-layer scheme with per-layer feature subsets.
print("# test{}".format(kind))
res_names.append("2layer fs_rfe_{}_{}".format(len(_feature_1), len(_feature_2)))
print("10cv")
_cvUtil = utils.CVUtilLayer2(Estimator, cvs_method, name).set_data(X_train, y_train, g_train, kind, _feature_1, _feature_2).fit()
res_cv.append(_cvUtil)
print("blind")
_blindUtil = utils.BlindTestUtilLayer2(Estimator, name).set_data(X_train, y_train, X_test1, y_test1, X_test2, y_test2, kind, _feature_1, _feature_2).fit()
res_blind.append(_blindUtil)


# Fit the same configuration on the full training set and persist it.
kwargs = {"random_state": 0}
# special_kind follows `kind` instead of a second hard-coded -1, so the saved model
# always matches the selectors loaded and the configuration evaluated above.
# NOTE(review): assumes special_kind has the same meaning as the kind argument
# given to set_data() - confirm in utils.
layer_rfe_model = utils.PonsolLayerEstimator(Estimator, kwargs=kwargs, special_kind=kind, feature_selected=[_feature_1, _feature_2])
layer_rfe_model.fit(X_train, y_train)
out_path = os.path.join(OUT_PATH, "2layer_rfe.model")
print("save to:", out_path)
joblib.dump(layer_rfe_model, out_path)

# Accuracy of the persisted model on both blind sets (fraction of matching labels).
p_test1 = layer_rfe_model.predict(X_test1)
p_test2 = layer_rfe_model.predict(X_test2)
print("acc test1 =", np.mean(y_test1 == p_test1))
print("acc test2 =", np.mean(y_test2 == p_test2))

## rfecv

In [None]:
# Label singled out by the two-layer scheme; the print shows it against the
# remaining labels of {-1, 0, 1}.
kind = -1
print(">> 针对{}：{}".format(kind, [j for j in [-1, 0, 1] if j != kind]))

# Persisted per-layer selectors for this kind, located through the config.
# (The paths used to be printed twice under two different labels; once is enough.)
_path1 = config["feature_selected"]["lightGBM_layer1_{}".format(kind)]
_path2 = config["feature_selected"]["lightGBM_layer2_{}".format(kind)]
print("rfecv layer1 ：", _path1)
print("rfecv layer2 ：", _path2)
rfecv_layer1 = joblib.load(_path1)
rfecv_layer2 = joblib.load(_path2)
# Feature names kept for each layer.
_feature_1 = FEATURE[rfecv_layer1.support_]
_feature_2 = FEATURE[rfecv_layer2.support_]

# List the kept features per layer, ten names per row.
for _layer_label, _layer_features in (("layer1", _feature_1), ("layer2", _feature_2)):
    print(_layer_label)
    print("count：", len(_layer_features))
    print("feature：")
    for _pos, _feat in enumerate(_layer_features):
        print(_feat, end=" ")
        if (_pos + 1) % 10 == 0:
            print()
    print()

# Evaluate the two-layer scheme with per-layer feature subsets.
print("# test{}".format(kind))
res_names.append("2layer fs_rfecv")
print("10cv")
_cvUtil = utils.CVUtilLayer2(Estimator, cvs_method, name).set_data(X_train, y_train, g_train, kind, _feature_1, _feature_2).fit()
res_cv.append(_cvUtil)
print("blind")
_blindUtil = utils.BlindTestUtilLayer2(Estimator, name).set_data(X_train, y_train, X_test1, y_test1, X_test2, y_test2, kind, _feature_1, _feature_2).fit()
res_blind.append(_blindUtil)


# Fit the same configuration on the full training set and persist it.
kwargs = {"random_state": 0}
# special_kind follows `kind` instead of a second hard-coded -1, so the saved model
# always matches the selectors loaded and the configuration evaluated above.
# NOTE(review): assumes special_kind has the same meaning as the kind argument
# given to set_data() - confirm in utils.
layer_rfecv_model = utils.PonsolLayerEstimator(Estimator, kwargs=kwargs, special_kind=kind, feature_selected=[_feature_1, _feature_2])
layer_rfecv_model.fit(X_train, y_train)
out_path = os.path.join(OUT_PATH, "2layer_rfecv.model")
print("save to:", out_path)
joblib.dump(layer_rfecv_model, out_path)

# Accuracy of the persisted model on both blind sets (fraction of matching labels).
p_test1 = layer_rfecv_model.predict(X_test1)
p_test2 = layer_rfecv_model.predict(X_test2)
print("acc test1 =", np.mean(y_test1 == p_test1))
print("acc test2 =", np.mean(y_test2 == p_test2))

# save result

In [None]:
# Write the collected results twice, once per value of the helper's final boolean
# switch (its meaning is defined in utils.result_output).
_report_args = (res_cv, res_blind, res_names, OUT_PATH, "10cv_3class_finally")
utils.result_output(*_report_args, False)
utils.result_output(*_report_args, True)

# save classifier

In [None]:
# Final artifact: the two-layer estimator on all features (no feature selection),
# fitted on the whole training set with a fixed seed.
Estimator = lgb.LGBMClassifier
kwargs = dict(random_state=0)
layer_estimator = utils.PonsolLayerEstimator(Estimator, kwargs=kwargs, special_kind=-1)
layer_estimator.fit(X_train, y_train)

# Persist it next to the other outputs of this run.
out_path = os.path.join(OUT_PATH, "ponsol2.model")
print("save to:", out_path)
joblib.dump(layer_estimator, out_path)