In [None]:
from collections import defaultdict
from pprint import pprint
import time
import os
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from balance_date import BalanceDate

import utils
pd.options.display.max_rows = 10  

%matplotlib inline


from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.model_selection import train_test_split
# Shuffled, stratified 10-fold splitter with a fixed seed so fold assignment is repeatable.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23.
# Keep using the vendored copy where it still exists (so behaviour on old installs is
# unchanged) and fall back to the standalone `joblib` package on newer scikit-learn.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib
from sklearn.feature_selection import RFE,RFECV  

from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb


from tqdm import tqdm, trange
# Parse the notebook configuration from the working directory. Entries under
# "feature_selected" are later passed to joblib.load as file paths.
with open("./config.json") as f:
    config = json.load(f)

In [None]:
# Feature names supplied by the project helper; later cells index this with the
# boolean `support_` masks of saved selectors.
FEATURE = BalanceDate.get_feature()
# One output directory per calendar day: ./out/YYYYMMDD
OUT_PATH = os.path.join(".", "out", time.strftime("%Y%m%d"))
# exist_ok=True replaces the check-then-create pattern, which can fail if the
# directory appears between the existence check and the makedirs call.
os.makedirs(OUT_PATH, exist_ok=True)
print("feature：", FEATURE)
print("out path：", OUT_PATH)

# Load data

In [None]:
# Selector values forwarded to the BalanceDate helpers below.
cv_kind = 1
train_set_kind = 1

# Two held-out test sets, plus the training matrix, labels and a third array
# (g_train — presumably group ids used by the CV utilities; TODO confirm).
X_test1, y_test1, X_test2, y_test2 = BalanceDate.get_test_values()
X_train, y_train, g_train = BalanceDate.get_train_values(train_set_kind)

# The same split is requested twice; only the `return_kind=2` variant
# (cvs_method) is handed to the CV utilities in later cells.
cvs = BalanceDate.split_cv(
    kind=cv_kind,
    train_set_kind=train_set_kind,
)
cvs_method = BalanceDate.split_cv(
    kind=cv_kind,
    train_set_kind=train_set_kind,
    return_kind=2,
)

# Print the label distribution of each split under its heading.
for _split_label, _split_y in (("test1", y_test1),
                               ("test2", y_test2),
                               ("train", y_train)):
    print(_split_label)
    utils.solubility_distribute(_split_y, 0, 1)

# Build the model

In [None]:
# Display label and estimator instance shared by the single-layer experiments below.
name = "lightGBM"
# LightGBM classifier with library defaults; only the seed is pinned.
model = lgb.LGBMClassifier(random_state=0)
print(name, model)

# direct

In [None]:
# Parallel accumulators — entry k of each list belongs to the same experiment:
#   res_cv    : fitted cross-validation utility results
#   res_blind : fitted blind-test utility results
#   res_names : experiment labels
res_cv, res_blind, res_names = [], [], []

In [None]:
# Baseline: evaluate `model` without any feature selector.
# NOTE: this cell appends to the result lists, so re-running it adds duplicate entries.
res_names.append("direct")

print("10cv")
_direct_cv = utils.CVUtil(model, name + "direct", cvs_method)
res_cv.append(_direct_cv.set_data(X_train, y_train, g_train).fit())

print("blind")
_direct_blind = utils.BlindTestUtil(model, name + "direct")
res_blind.append(
    _direct_blind.set_data(X_train, y_train, X_test1, y_test1, X_test2, y_test2).fit()
)

# After feature selection (fs)

## rfe

In [None]:
def direct_rfe(file_path, kind=-1):
    """Evaluate the single-layer model restricted to a saved feature selection.

    Loads a fitted selector from ``file_path`` with joblib, prints the selected
    feature names (ten per line), then runs the cross-validation and blind-test
    utilities with that selector. Reads the notebook globals ``FEATURE``,
    ``model``, ``cvs_method`` and the train/test arrays.

    Parameters
    ----------
    file_path : str
        Path of the serialized selector. The loaded object must expose
        ``n_features_`` and ``support_``. joblib.load unpickles the file, so it
        should only be pointed at trusted artifacts.
    kind : int, optional
        Accepted for call-site symmetry but not used by this function.

    Returns
    -------
    tuple
        ``(name, cv_result, blind_result)`` where ``name`` is
        ``"direct_fs_rf_<n_features>"``.
    """
    selector = joblib.load(file_path)
    n_selected = selector.n_features_

    print("dir:", file_path)
    print("count：", n_selected)
    print("feature：")
    # Ten names per output line, each followed by a single space.
    for position, feature_name in enumerate(FEATURE[selector.support_], start=1):
        print(feature_name, end=" ")
        if position % 10 == 0:
            print()
    print()

    name = "direct_fs_rf_{}".format(n_selected)
    run_label = name + "fs"

    print("10cv")
    cv_runner = utils.CVUtil(model, run_label, cvs_method, feature_select=selector)
    _cv_res = cv_runner.set_data(X_train, y_train, g_train).fit()

    print("blind")
    blind_runner = utils.BlindTestUtil(model, run_label, feature_select=selector)
    _blind_res = blind_runner.set_data(
        X_train, y_train, X_test1, y_test1, X_test2, y_test2
    ).fit()

    return name, _cv_res, _blind_res

In [None]:
# Single-layer RFE runs with 100, 50, 30 and 20 retained features, in that order.
# NOTE: this cell appends to the result lists, so re-running it adds duplicate entries.
_direct_rfe_keys = (
    "LightGBM_rfe_100",
    "LightGBM_rfe_50",
    "LightGBM_rfe_30",
    "LightGBM_rfe_20",
)
for _rfe_key in _direct_rfe_keys:
    r = direct_rfe(config["feature_selected"][_rfe_key], -1)
    res_names.append(r[0])
    res_cv.append(r[1])
    res_blind.append(r[2])

# direct 2layer

In [None]:
# Label plus the estimator *class* (not an instance): the two-layer utilities
# receive the class itself and construct their own estimators.
name = "lightGBM"
Estimator = lgb.LGBMClassifier
print(name, Estimator)

In [None]:
# Value passed as the trailing argument of both set_data calls below.
i = -1
res_names.append("2layer")

print("10cv")
_cvUtil = (
    utils.CVUtilLayer2(Estimator, cvs_method, name + "2layer")
    .set_data(X_train, y_train, g_train, i)
    .fit()
)
res_cv.append(_cvUtil)

print("blind")
_blindUtil = (
    utils.BlindTestUtilLayer2(Estimator, name + "2layer")
    .set_data(X_train, y_train, X_test1, y_test1, X_test2, y_test2, i)
    .fit()
)
res_blind.append(_blindUtil)

# Fit one two-layer estimator on the full training set and persist it.
kwargs = dict(random_state=0)
layer_direct_model = utils.PonsolLayerEstimator(
    Estimator,
    kwargs=kwargs,
    special_kind=-1,
)
layer_direct_model.fit(X_train, y_train)
out_path = os.path.join(OUT_PATH, "2layer_directly.model")
print("save to:", out_path)
joblib.dump(layer_direct_model, out_path)

# Plain accuracy of the persisted model on both held-out sets.
p_test1 = layer_direct_model.predict(X_test1)
p_test2 = layer_direct_model.predict(X_test2)
for _acc_tag, _truth, _pred in (("acc test1 =", y_test1, p_test1),
                                ("acc test2 =", y_test2, p_test2)):
    print(_acc_tag, sum(_truth == _pred) / len(_truth))

# 2layer + fs

## rfe

In [None]:
def _print_layer_features(features, per_row=10):
    """Print the feature count, then the names space-separated, ``per_row`` per line."""
    print("count：", len(features))
    print("feature：")
    for position, feature_name in enumerate(features, start=1):
        print(feature_name, end=" ")
        if position % per_row == 0:
            print()
    print()


def layer_rfe(fs_path, fs_base_path, kind=-1):
    """Evaluate and persist the two-layer model on per-layer feature subsets.

    Resolves one feature subset per layer from saved selectors, prints them,
    runs the two-layer cross-validation and blind-test utilities, then fits a
    ``utils.PonsolLayerEstimator`` on the full training set, dumps it under
    ``OUT_PATH`` and prints its accuracy on both test sets. Reads the notebook
    globals ``FEATURE``, ``Estimator``, ``cvs_method``, ``OUT_PATH`` and the
    train/test arrays.

    Parameters
    ----------
    fs_path : sequence of two str
        Paths of the serialized selectors for layer 1 and layer 2. They are
        loaded with joblib (pickle based), so only trusted files should be used.
    fs_base_path : sequence of two str or None
        ``None``: the masks from ``fs_path`` are applied directly to ``FEATURE``.
        Otherwise: paths of the base selectors; their masks are applied to
        ``FEATURE`` first and the masks from ``fs_path`` are applied to the
        result.
    kind : int, optional
        Forwarded as the trailing argument of both ``set_data`` calls and used
        as ``special_kind`` of the persisted estimator.

    Returns
    -------
    tuple
        ``(name, cv_result, blind_result)`` where ``name`` is
        ``"2layer fs_rfe_<n1>_<n2>"``.
    """
    is_direct = (fs_base_path is None)
    print(">> for{}：{}".format(kind, [j for j in [-1, 0, 1] if j != kind]))
    if is_direct:
        _path1, _path2 = fs_path
        rfecv_layer1 = joblib.load(_path1)
        rfecv_layer2 = joblib.load(_path2)
        _feature_1 = FEATURE[rfecv_layer1.support_]
        _feature_2 = FEATURE[rfecv_layer2.support_]

        print("rfecv：")
        print("layer1:", _path1)
        _print_layer_features(_feature_1)
        print("layer2:", _path2)
        _print_layer_features(_feature_2)
    else:
        _fs_special1_path, _fs_special2_path = fs_path
        _fs_base1_path, _fs_base2_path = fs_base_path
        # Two-stage selection: base mask on FEATURE, then the special mask on
        # the already-reduced feature list.
        _feature_1 = FEATURE[joblib.load(_fs_base1_path).support_][joblib.load(
            _fs_special1_path).support_]
        _feature_2 = FEATURE[joblib.load(_fs_base2_path).support_][joblib.load(
            _fs_special2_path).support_]

        print("layer1")
        _print_layer_features(_feature_1)
        print("layer2")
        _print_layer_features(_feature_2)

    name = "2layer fs_rfe_{}_{}".format(len(_feature_1), len(_feature_2))
    print("# for{}".format(kind))
    print("10cv")
    _cvUtil = utils.CVUtilLayer2(Estimator, cvs_method,
                                 name).set_data(X_train, y_train, g_train,
                                                kind, _feature_1,
                                                _feature_2).fit()
    print("blind")
    _blindUtil = utils.BlindTestUtilLayer2(Estimator, name).set_data(
        X_train, y_train, X_test1, y_test1, X_test2, y_test2, kind, _feature_1,
        _feature_2).fit()

    kwargs = {
        "random_state": 0,
    }
    # Persist a model built for the same `kind` that was just evaluated. This
    # used to be hard-coded to -1, which only matched the evaluation when the
    # caller passed kind=-1 (the default, and what every call in this notebook
    # uses). NOTE(review): assumes `special_kind` has the same meaning as the
    # `kind` argument of set_data — confirm against utils.
    layer_rfe_model = utils.PonsolLayerEstimator(
        Estimator,
        kwargs=kwargs,
        special_kind=kind,
        feature_selected=[_feature_1, _feature_2])
    layer_rfe_model.fit(X_train, y_train)
    out_path = os.path.join(
        OUT_PATH, "2layer_rfe_{}_{}.model".format(len(_feature_1),
                                                  len(_feature_2)))
    print("path:", out_path)
    joblib.dump(layer_rfe_model, out_path)

    p_test1 = layer_rfe_model.predict(X_test1)
    p_test2 = layer_rfe_model.predict(X_test2)
    print("acc test1 =", sum(y_test1 == p_test1) / len(y_test1))
    print("acc test2 =", sum(y_test2 == p_test2) / len(y_test2))

    return name, _cvUtil, _blindUtil

In [None]:
# Two-layer RFE runs whose selectors apply directly to FEATURE (100, 50, 30).
# The 20-feature variant (keys "lightGBM_layer1_-1_ref_20" / "lightGBM_layer2_-1_ref_20")
# was left disabled in this notebook and stays excluded here.
# NOTE: this cell appends to the result lists, so re-running it adds duplicate entries.
_layer_rfe_keys = (
    ("lightGBM_layer1_-1_ref_100", "lightGBM_layer2_-1_ref_100"),
    ("lightGBM_layer1_-1_ref_50", "lightGBM_layer2_-1_ref_50"),
    ("lightGBM_layer1_-1_ref_30", "lightGBM_layer2_-1_ref_30"),
)
for _layer1_key, _layer2_key in _layer_rfe_keys:
    r = layer_rfe(
        [
            config["feature_selected"][_layer1_key],
            config["feature_selected"][_layer2_key],
        ],
        None,
        -1,
    )
    res_names.append(r[0])
    res_cv.append(r[1])
    res_blind.append(r[2])

# 20 plus: each "special" config entry is indexed as [0] -> key of the base
# selector inside config["feature_selected"], [1] -> path of the selector that
# is applied on top of that base selection.
_special_cfg = config["feature_selected_special"]
_special_layer1 = _special_cfg["lightGBM_ref_specia-l_layer1"]
_special_layer2 = _special_cfg["lightGBM_ref_specia-l_layer2"]
r = layer_rfe(
    [
        _special_layer1[1],
        _special_layer2[1],
    ],
    [
        config["feature_selected"][_special_layer1[0]],
        config["feature_selected"][_special_layer2[0]],
    ],
    -1,
)
res_names.append(r[0])
res_cv.append(r[1])
res_blind.append(r[2])

# save result

In [None]:
# Write the collected results twice under the same base name, once with the
# trailing flag set to False and once with it set to True.
for _output_flag in (False, True):
    utils.result_output(
        res_cv,
        res_blind,
        res_names,
        OUT_PATH,
        "10cv_3class_finally",
        _output_flag,
    )