In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import PredefinedSplit
import sys
sys.path.append('.')
import utils



In [2]:
def get_ks_score(prob, label):
    '''
    Compute the KS (Kolmogorov-Smirnov) score of a binary scorer.

    Scores are rounded to 3 decimals and each distinct rounded value is used
    as a bin. Bins are walked in ascending score order, accumulating the share
    of good and bad samples seen so far; the result is the largest value of
    (cumulative good share - cumulative bad share). The difference is signed,
    not absolute, so a scorer that ranks bad samples *below* good ones yields
    a value near 0 rather than a large one.

    :param prob: predicted probability (score) of belonging to the bad class,
                 one value per sample
    :param label: true labels, one per sample; bad samples are counted by
                  summing the labels, so 1 must mean bad and 0 good
    :return: KS score; NaN when either class is absent from ``label``
    '''
    scored = pd.DataFrame(data = {'label': label, 'prob': prob})
    # Element-wise Python round() is kept on purpose: it fixes the bin edges,
    # and a different rounding routine could move borderline scores.
    scored['prob'] = scored['prob'].map(lambda p: round(p, 3))

    # groupby sorts by the rounded score, so the cumulative sums below run
    # from the lowest score bin to the highest.
    labels_by_bin = scored.groupby('prob')['label']
    total = labels_by_bin.count()
    bad = labels_by_bin.sum()
    good = total - bad

    good_cum_per = good.cumsum() / good.sum()
    bad_cum_per = bad.cumsum() / bad.sum()
    # Vectorised gap between the two cumulative distributions.
    return (good_cum_per - bad_cum_per).max()

def my_scoring(self, X, y, sample_weight=None):
    '''
    Score a fitted binary classifier by the KS statistic of its predictions.

    The signature mirrors an estimator ``score(X, y, sample_weight)`` method,
    so ``self`` is the estimator being scored.

    The previous version also computed a ROC AUC through ``metrics`` (a name
    this file never imports, so the call failed with NameError) and a combined
    ``auc + ks`` value, but discarded both and returned only the KS score.
    That dead computation has been removed; the returned value is unchanged.

    :param self: object exposing ``predict_proba(X)``; column 1 of its result
                 is used as the bad-class score
    :param X: feature matrix passed straight to ``predict_proba``
    :param y: true labels, forwarded to ``get_ks_score``
    :param sample_weight: accepted for signature compatibility; ignored
    :return: KS score as computed by ``get_ks_score``
    '''
    scoring_prob = self.predict_proba(X)[:, 1]
    return get_ks_score(scoring_prob, y)

In [25]:
# 10-fold evaluation of one fixed xgboost configuration: for every fold the
# features are min-max scaled (scaler fitted on the training part only), a
# booster is trained, and AUC / KS are collected for train, val and test.
data = pd.read_csv('./cmh_level/level_6_median.csv')
params = {
    'booster': 'gbtree',
    # NOTE(review): with 'binary:logitraw' predict() returns the raw margin
    # (score before the logistic transform), so the values scored below are
    # not probabilities -- confirm this is intended.
    'objective': 'binary:logitraw',
    'eval_metric': 'auc',

    'eta': 0.1,  # learning rate (author's note: 0.2)

    'scale_pos_weight': 34,  # author's note: 2

    'max_depth': 2,
    'min_child_weight': 1.5,  # author's note: 1.6
    'gamma': 0.5,  # larger values make the algorithm more conservative
    'subsample': 0.5,  # fraction of rows sampled for each tree
    'colsample_bytree': 0.9,  # fraction of columns sampled for each tree

    # Maximum step size of each tree's weight change; can help with class
    # imbalance (logistic regression).
    'max_delta_step': 1,

    'alpha': 0.9,  # L1 regularisation
    'lambda': 0.6,  # L2 regularisation

    'silent': 0,
}
train_auc, train_ks = [], []
val_auc, val_ks = [], []
test_auc, test_ks = [], []

# `.values` replaces the deprecated `as_matrix()` (removed in pandas 1.0) and
# is available in old pandas releases as well.
# pop() removes the label column in place, so `data` holds features only afterwards.
data_y = data.pop('label').values
data_X = data.values
i = 0

for j in range(1):
    for X_train, X_val, X_test, y_train, y_val, y_test in utils.kfold(data_X, data_y, num_fold = 10):
        print(i)
        i += 1

        # Fit the scaler on the training part only, then apply it to all three parts.
        min_max_scaler = MinMaxScaler()
        min_max_scaler.fit(X_train)
        X_train = min_max_scaler.transform(X_train)
        X_val = min_max_scaler.transform(X_val)
        X_test = min_max_scaler.transform(X_test)

        dtrain = xgb.DMatrix(X_train, label=y_train)  # xgboost data format
        dval = xgb.DMatrix(X_val, label=y_val)
        dtest = xgb.DMatrix(X_test, label=y_test)

        xg_model = xgb.train(params, dtrain, num_boost_round=84)  # author's note: 10-150

        # Score every split the same way and append to its result lists.
        # utils.model_key_performance presumably returns (auc, ks) -- verify in utils.
        for dmatrix, y_true, auc_scores, ks_scores in (
            (dtrain, y_train, train_auc, train_ks),
            (dval, y_val, val_auc, val_ks),
            (dtest, y_test, test_auc, test_ks),
        ):
            fold_auc, fold_ks = utils.model_key_performance(xg_model.predict(dmatrix), y_true)
            auc_scores.append(fold_auc)
            ks_scores.append(fold_ks)

print(np.mean(train_auc), np.mean(train_ks))
print(np.mean(val_auc), np.mean(val_ks))
print(np.mean(test_auc), np.mean(test_ks))

0
1
2
3
4
5
6
7
8
9
0.851312199018 0.552354978712
0.809249284941 0.508107220161
0.810423293862 0.520493729056


In [23]:
# Author's note: "best 0 1" (meaning not evident from this notebook).
# Print mean AUC and mean KS over all folds, one line per split (train, val, test).
for auc_scores, ks_scores in ((train_auc, train_ks), (val_auc, val_ks), (test_auc, test_ks)):
    print(np.mean(auc_scores), np.mean(ks_scores))

0.851312199018 0.552354978712
0.809249284941 0.508107220161
0.810423293862 0.520493729056


In [27]:
# Display the per-fold test-set KS values collected by the cross-validation loop.
test_ks

[0.54474756320438622,
 0.4338563592294935,
 0.55427988263809169,
 0.52532790592492085,
 0.51889735471825027,
 0.57165222836864626,
 0.48300455763142336,
 0.55350868037435208,
 0.48001832330190541,
 0.53964443516682326]

In [26]:
def _trimmed_mean(scores):
    '''
    Mean of ``scores`` after dropping one maximum and one minimum value.

    The divisor is derived from the number of scores instead of being fixed
    at 8, so the result stays correct when the fold count is not 10.

    :param scores: sequence of per-fold metric values
    :return: mean of the remaining ``len(scores) - 2`` values
    :raises ValueError: if there are two or fewer scores
    '''
    if len(scores) <= 2:
        raise ValueError('need more than two scores to drop the max and the min')
    return (np.sum(scores) - np.max(scores) - np.min(scores)) / (len(scores) - 2)


# Trimmed mean AUC and KS per split (train, val, test).
print(_trimmed_mean(train_auc), _trimmed_mean(train_ks))
print(_trimmed_mean(val_auc), _trimmed_mean(val_ks))
print(_trimmed_mean(test_auc), _trimmed_mean(test_ks))

0.851264940686 0.552949281894
0.809640006571 0.510001642703
0.811647289066 0.52492858787


In [7]:
# xgboost booster configuration, assembled group by group.
params = {}

# Task definition.
params.update({
    'booster': 'gbtree',
    'objective': 'binary:logitraw',
    'eval_metric': 'auc',
})

# Learning rate (author's note: 0.2).
params['eta'] = 0.1

# Author's note: 2.
params['scale_pos_weight'] = 34

# Tree shape and sampling.
params.update({
    'max_depth': 2,
    'min_child_weight': 1.5,  # author's note: 1.6 ??
    'gamma': 0.5,  # larger values make the algorithm more conservative (author's note: ??)
    'subsample': 0.5,  # fraction of rows sampled for each tree
    'colsample_bytree': 0.9,  # fraction of columns sampled for each tree
})

# Maximum step size of each tree's weight change; can help with class
# imbalance (logistic regression).
params['max_delta_step'] = 1

# L1 / L2 regularisation.
params.update({'alpha': 0.9, 'lambda': 0.6})

params['silent'] = 0

# Number of boosting rounds.
num_boost_round = 84

In [24]:
# DISABLED EXPERIMENT (kept for reference, not executed): 20 repeated
# train/val/test splits via utils.train_val_test_split instead of k-fold,
# with 'binary:logistic' and 50 boosting rounds. `train_size`, `val_size`
# and `test_size` are not defined in the cells shown here -- define them
# before re-enabling this cell.
# data = pd.read_csv('./cmh_level/level_6_median.csv')
# params = {
#     'booster': 'gbtree',
#     'objective': 'binary:logistic',
#     'eval_metric': 'auc',
    
#     'eta': 0.1,# learning rate
    
#     'scale_pos_weight': 100,
    
#     'max_depth': 2,
#     'min_child_weight': 1.5,
#     'gamma': 0.5, # larger values make the algorithm more conservative
#     'subsample': 0.5, # fraction of rows sampled for each tree
#     'colsample_bytree': 0.8, # fraction of columns sampled for each tree
    
#     'max_delta_step': 1, # maximum step size of each tree's weight change; can help with class imbalance (logistic regression)

#     'alpha': 0.8, # L1 regularisation
#     'lambda': 0.6, # L2 regularisation

#     'silent': 0,
# }
# train_auc, train_ks = [], []
# val_auc, val_ks = [], []
# test_auc, test_ks = [], []

# i = 0

# for j in range(20):
#     print('j =', j)
#     X_train, X_val, X_test, y_train, y_val, y_test = utils.train_val_test_split(data, train_size, val_size, test_size, 
#                                                                             random_state=None, time_factors=False)
#     min_max_scaler = MinMaxScaler()
#     min_max_scaler.fit(X_train)
#     X_train = min_max_scaler.transform(X_train)
#     X_val = min_max_scaler.transform(X_val)
#     X_test = min_max_scaler.transform(X_test)

#     dtrain = xgb.DMatrix(X_train, label=y_train) # xgboost data style
#     dval = xgb.DMatrix(X_val, label=y_val)
#     dtest = xgb.DMatrix(X_test, label=y_test)
    
#     xg_model = xgb.train(params, dtrain, num_boost_round=50)
    
#     y_train_pred_prob = xg_model.predict(dtrain)
#     y_val_pred_prob = xg_model.predict(dval)
#     y_test_pred_prob = xg_model.predict(dtest)
#     temp_auc, temp_ks = utils.model_key_performance(y_train_pred_prob, y_train)
#     train_auc.append(temp_auc)
#     train_ks.append(temp_ks)
#     temp_auc, temp_ks = utils.model_key_performance(y_val_pred_prob, y_val)
#     val_auc.append(temp_auc)
#     val_ks.append(temp_ks)
#     temp_auc, temp_ks = utils.model_key_performance(y_test_pred_prob, y_test)
#     test_auc.append(temp_auc)
#     test_ks.append(temp_ks)

# print(np.mean(train_auc), np.mean(train_ks))
# print(np.mean(val_auc), np.mean(val_ks))
# print(np.mean(test_auc), np.mean(test_ks))

j = 0
j = 1
j = 2
j = 3
j = 4
j = 5
j = 6
j = 7
j = 8
j = 9
j = 10
j = 11
j = 12
j = 13
j = 14
j = 15
j = 16
j = 17
j = 18
j = 19
0.83384344715 0.520424270436
0.793066585199 0.471246688254
0.793445098299 0.474210050747
