In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss, average_precision_score
from sklearn.ensemble import RandomForestClassifier
# import tqdm
warnings.filterwarnings('ignore')

In [2]:
# Read the training and test CSV files from the originalDataset directory
# (paths are relative to the notebook's working directory).
data_train, data_test_a = (
    pd.read_csv(csv_path)
    for csv_path in ('originalDataset/train.csv', 'originalDataset/test.csv')
)

In [3]:
# Model inputs: every training column except the listed ones (which include
# the 'Evaluation' target) and any column whose name contains '_outliers'.
features = [
    col
    for col in data_train.columns
    if col not in ['CaseId', 'Evaluation', "policyCode", "issueDate", "earliesCreditLine"]
    and '_outliers' not in col
]

# Target vector plus the train/test design matrices restricted to `features`.
y_train = data_train['Evaluation']
x_train, x_test = data_train[features], data_test_a[features]

In [14]:
def aucprFunction(pred, train_data):
    '''
    Custom evaluation function that scores predictions with average precision.

    When used with LightGBM, see
    https://github.com/microsoft/LightGBM/blob/c02917e493c36f3b1e349338f1087fed33126576/examples/python-guide/advanced_example.py#L154

    pred: predicted scores for the rows of ``train_data``.
    train_data: dataset object exposing ``get_label()``.

    Returns a 3-tuple: the metric's display name, the computed score, and a
    flag stating that a higher score is better.
    '''
    return "AUC_PR", average_precision_score(train_data.get_label(), pred), True

class AUCPR_forCAT(object):
    '''
    Custom CatBoost evaluation metric that reports average precision.

    Follows the custom-metric interface described at
    https://catboost.ai/docs/concepts/python-usages-examples.html#custom-loss-function-eval-metric
    '''
    def is_max_optimal(self):
        # A larger metric value is better.
        return True

    def evaluate(self, approxes, target, weight):
        # approxes: list of indexed containers (only __len__ and __getitem__
        #           are defined), one container of floats per approx dimension.
        # target:   one dimensional indexed container.
        # weight:   one dimensional indexed container; may be None.
        # Returns the pair (error, weights sum).

        # The weight argument is not used by this metric.
        assert len(approxes) == 1
        predictions = approxes[0]
        assert len(target) == len(predictions)
        labels = list(target)
        # -1 is a placeholder for the weights sum; get_final_error ignores it.
        return average_precision_score(labels, predictions), -1

    def get_final_error(self, error, weight):
        # The final metric value is the error as computed by evaluate().
        return error

def cv_model(clf, train_x, train_y, test_x, clf_name, folds = 5):
    '''
    Train one model per stratified fold and collect out-of-fold and test predictions.

    clf: the modelling backend selected by ``clf_name`` -- an object exposing
        ``Dataset``/``train`` for "lgb", ``DMatrix``/``train`` for "xgb", or a
        callable returning an estimator with ``fit``/``predict`` for "cat".
    train_x, train_y: training features and labels; indexed with ``.iloc``.
    test_x: features of the rows to predict with every fold's model.
    clf_name: one of "lgb", "xgb" or "cat".
    folds: number of StratifiedKFold splits.

    Returns ``(train, test)``: ``train`` holds, for every training row, the
    prediction of the fold model that did not see that row; ``test`` is the
    mean of the per-fold predictions on ``test_x``.

    Raises ValueError when ``clf_name`` is not one of the supported names.
    '''
    # Fail fast: an unsupported name used to surface only as an
    # UnboundLocalError on `test_pred` after the first fold had been split.
    if clf_name not in ("lgb", "xgb", "cat"):
        raise ValueError("unknown clf_name %r; expected 'lgb', 'xgb' or 'cat'" % (clf_name,))

    seed = 2020
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y = train_x.iloc[train_index], train_y.iloc[train_index]
        val_x, val_y = train_x.iloc[valid_index], train_y.iloc[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            # NOTE(review): 'nthread' and 'n_jobs' both look like thread-count
            # settings and carry different values -- confirm which is intended.
            params = {
                'boosting_type': 'gbdt', 'objective': 'binary',
                # 'metric': 'auc',
                'min_child_weight': 5, 'num_leaves': 2 ** 5, 'lambda_l2': 10, 'feature_fraction': 0.8,
                'bagging_fraction': 0.8, 'bagging_freq': 4, 'learning_rate': 0.1, 'seed': seed,
                'nthread': 28, 'n_jobs':24, 'silent': True, 'verbose': -1,
#                 'device_type': 'gpu',
#                 'max_bin': 63,
            }
            # NOTE(review): the verbose_eval / early_stopping_rounds keyword
            # arguments were removed from lgb.train in LightGBM 4.0 (replaced by
            # callbacks) -- confirm against the installed version before upgrading.
            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200, feval=aucprFunction)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])

        elif clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)

            params = {'booster': 'gbtree', 'objective': 'binary:logistic', 'eval_metric': 'aucpr', 'gamma': 1,
                      'min_child_weight': 1.5, 'max_depth': 5, 'lambda': 10, 'subsample': 0.7, 'colsample_bytree': 0.7, 'colsample_bylevel': 0.7,
                      'eta': 0.04, 'tree_method': 'exact', # "gpu_hist", #
                      'seed': seed, 'nthread': 36, "silent": True,
                      }

            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]

            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            # Booster.predict needs a DMatrix rather than a DataFrame, see
            # https://stackoverflow.com/questions/55579610/xgboost-attributeerror-dataframe-object-has-no-attribute-feature-names
            # NOTE(review): ntree_limit / best_ntree_limit were removed in
            # XGBoost 2.0 (iteration_range replaces them) -- confirm against the
            # installed version before upgrading.
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            # Build the test matrix from the backend that was passed in instead
            # of reaching for the global `xgb` module.
            test_pred = model.predict(clf.DMatrix(test_x) , ntree_limit=model.best_ntree_limit)

        else:  # clf_name == "cat" (validated at the top of the function)
            ## Supported metrics: https://catboost.ai/docs/references/eval-metric__supported-metrics.html
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False, "task_type": "GPU"} #, "eval_metric": "Kappa"

            model = clf(iterations=20000, **params)#, eval_metric = AUCPR_forCAT()
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)

            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)

        ## Test predictions are summed here and divided by the number of folds
        ## on return; adding `test_pred / kf.n_splits` per fold would be
        ## an equivalent way to take the mean.
        test += test_pred
        ## Store this fold's out-of-fold predictions at their original row positions.
        train[valid_index] = val_pred
        cv_scores.append(average_precision_score(val_y, val_pred))

        # Running mean of the fold scores so far, followed by the raw list.
        print(sum(cv_scores)/len(cv_scores), cv_scores)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    print("what is kf.n_splits?", kf.n_splits)
    return train, test / kf.n_splits

In [15]:
def lgb_model(x_train, y_train, x_test):
    """Run cv_model with the LightGBM module under the "lgb" name.

    Returns the pair produced by cv_model: predictions for the training rows
    and predictions for the test rows.
    """
    return cv_model(lgb, x_train, y_train, x_test, "lgb")

def xgb_model(x_train, y_train, x_test):
    """Run cv_model with the XGBoost module under the "xgb" name.

    Returns the pair produced by cv_model: predictions for the training rows
    and predictions for the test rows.
    """
    return cv_model(xgb, x_train, y_train, x_test, "xgb")

def cat_model(x_train, y_train, x_test):
    """Run cv_model with CatBoostRegressor under the "cat" name.

    Returns the pair produced by cv_model: predictions for the training rows
    and predictions for the test rows.
    """
    return cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")

In [17]:
# Run the LightGBM cross-validation; lgb_train and lgb_test receive the two
# values returned by lgb_model (training-row and test-row predictions).
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's binary_logloss: 0.159898	training's AUC_PR: 0.86351	valid_1's binary_logloss: 0.16623	valid_1's AUC_PR: 0.853332
[400]	training's binary_logloss: 0.15819	training's AUC_PR: 0.865628	valid_1's binary_logloss: 0.165701	valid_1's AUC_PR: 0.853618
Early stopping, best iteration is:
[371]	training's binary_logloss: 0.158361	training's AUC_PR: 0.865472	valid_1's binary_logloss: 0.165688	valid_1's AUC_PR: 0.853824
0.8538241432478658 [0.8538241432478658]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's binary_logloss: 0.160493	training's AUC_PR: 0.862722	valid_1's binary_logloss: 0.163383	valid_1's AUC_PR: 0.856771
[400]	training's binary_logloss: 0.158726	training's AUC_PR: 0.864915	valid_1's binary_logloss: 0.162932	valid_1's AUC_PR: 0