In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import PredefinedSplit
from sklearn.model_selection import StratifiedKFold
from collections import Counter
import sys
sys.path.append('.')
import utils



In [2]:
def get_ks_score(prob, label):
    '''
    计算ks得分
    :param pro: 属于坏人的概率
    :param label: 真实标签
    :return: ks得分
    '''
    df = pd.DataFrame(data = {'label': label, 'prob': prob})
    df['prob'] = df['prob'].map(lambda x: round(x, 3))
    total = pd.DataFrame({'total': df.groupby('prob')['label'].count()})
    bad = pd.DataFrame({'bad': df.groupby('prob')['label'].sum()})
    all_data = total.merge(bad, how = 'left', left_index = True, right_index = True)
    all_data['good'] = all_data['total'] - all_data['bad']
    all_data.reset_index(inplace = True)
    all_data['goodCumPer'] = all_data['good'].cumsum() / all_data['good'].sum()
    all_data['badCumPer'] = all_data['bad'].cumsum() / all_data['bad'].sum()
    KS_m = all_data.apply(lambda x: x.goodCumPer - x.badCumPer, axis = 1)
    return max(KS_m)

def my_scoring(self, X, y, sample_weight=None):
    scoring_prob = self.predict_proba(X)[:, 1]
    false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(y, scoring_prob)
    auc = metrics.auc(false_positive_rate, true_positive_rate)
    ks = get_ks_score(scoring_prob, y)
    #print(X.shape)
    #score_need = 2*auc*ks/(auc + ks)
    score_need = (auc + ks)
    return ks

In [3]:
data = pd.read_csv('./cmh_level/level_6.csv')
data.values.shape

# X_train = pd.read_csv('./time_split/train_data_6_10_feature_selected_median_level_6.csv')
# X_val = pd.read_csv('./time_split/val_data_11_feature_selected_median_level_6.csv')
# X_test = pd.read_csv('./time_split/test_data_feature_selected_12_median_level_6.csv')
# y_train = X_train.pop('label').as_matrix()
# y_val = X_val.pop('label').as_matrix()
# y_test = X_test.pop('label').as_matrix()
# X_train = X_train.as_matrix()
# X_val = X_val.as_matrix()
# X_test = X_test.as_matrix()

# X_train = pd.read_csv('./cmh_level/level6_onehot/X_train.csv').as_matrix()
# X_val = pd.read_csv('./cmh_level/level6_onehot/X_val.csv').as_matrix()
# X_test = pd.read_csv('./cmh_level/level6_onehot/X_test.csv').as_matrix()
# y_train = pd.read_csv('./cmh_level/level6_onehot/y_train.csv').as_matrix().T[0]
# y_val = pd.read_csv('./cmh_level/level6_onehot/y_val.csv').as_matrix().T[0]
# y_test = pd.read_csv('./cmh_level/level6_onehot/y_test.csv').as_matrix().T[0]

(79054, 123)

In [4]:
train_size = 0.74
val_size = 0.13
test_size = 0.13

X_train, X_val, X_test, y_train, y_val, y_test = utils.train_val_test_split(data, train_size, val_size, test_size, 
                                                                            random_state=0, time_factors=False)

min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_train)
X_train = min_max_scaler.transform(X_train)
X_val = min_max_scaler.transform(X_val)
X_test = min_max_scaler.transform(X_test)

print(Counter(y_train), Counter(y_val), Counter(y_test))

Counter({0: 58008, 1: 491}) Counter({0: 10191, 1: 86}) Counter({0: 10192, 1: 86})


In [5]:
# X_train = pd.DataFrame(X_train)
# X_test = pd.DataFrame(X_test)
# X_val = pd.DataFrame(X_val)
# y_train = pd.DataFrame(y_train)
# y_test = pd.DataFrame(y_test)
# y_val = pd.DataFrame(y_val)

# X_train.to_csv('X_train.csv', index = False)
# X_test.to_csv('X_test.csv', index = False)
# X_val.to_csv('X_val.csv', index = False)
# y_train.to_csv('y_train.csv', index = False)
# y_test.to_csv('y_test.csv', index = False)
# y_val.to_csv('y_val.csv', index = False)

In [6]:
#原始逻辑回归baseline
lr = LogisticRegression(C=0.1, class_weight={1:30})
lr.fit(X_train, y_train)
y_train_pred_prob = lr.predict_proba(X_train)[:, 1]
y_val_pred_prob = lr.predict_proba(X_val)[:, 1]
y_test_pred_prob = lr.predict_proba(X_test)[:, 1]
utils.model_key_performance(y_train_pred_prob, y_train)
utils.model_key_performance(y_val_pred_prob, y_val)
utils.model_key_performance(y_test_pred_prob, y_test)

(0.7999599549121974, 0.50095378043883021)

In [7]:
#原始xgboost baseline 
#level_6的最佳参数
# dtrain = xgb.DMatrix(X_train, label=y_train) # xgboost data style
# dval = xgb.DMatrix(X_val, label=y_val)
# dtest = xgb.DMatrix(X_test, label=y_test)
# params = {
#     'booster': 'gbtree',
#     'objective': 'binary:logitraw',
#     'eval_metric': 'auc',
#     'max_depth': 2,
#     'max_delta_step': 3,
#     'lambda': 0.8,
#     'subsample': 0.4,
#     'colsample_bytree': 1,
#     'min_child_weight': 1.5,
#     'eta': 0.2,
#     'silent': 0,
#     'scale_pos_weight': 2,
#     'gamma': 2
# }
# # watchlist = [(dtrain, 'train'), (dval, 'val'), (dtest, 'test')]
# # train_val_features = np.concatenate((X_train, X_val), axis = 0)
# # train_val_labels = np.concatenate((y_train, y_val), axis = 0)
# # test_fold = np.zeros(train_val_features.shape[0])
# # test_fold[:X_train_need[i].shape[0]] = -1
# # ps = PredefinedSplit(test_fold = test_fold)

# # xg_model = GridSearchCV(estimator=LogisticRegression(), param_grid=params, 
# #                      scoring=my_scoring, n_jobs=-1, cv=ps, verbose=0)
# # model.fit(train_val_features, train_val_labels)
# xg_model = xgb.train(params, dtrain, num_boost_round=60)

# y_train_pred_prob = xg_model.predict(dtrain)
# y_val_pred_prob = xg_model.predict(dval)
# y_test_pred_prob = xg_model.predict(dtest)
# utils.model_key_performance(y_train_pred_prob, y_train)
# utils.model_key_performance(y_val_pred_prob, y_val)
# utils.model_key_performance(y_test_pred_prob, y_test)

In [8]:
#原始xgboost baseline
#level_6_median的最佳参数
# dtrain = xgb.DMatrix(X_train, label=y_train) # xgboost data style
# dval = xgb.DMatrix(X_val, label=y_val)
# dtest = xgb.DMatrix(X_test, label=y_test)
# params = {
#     'booster': 'gbtree',
#     'objective': 'binary:logitraw',
#     'eval_metric': 'auc',
#     'max_depth': 2,
#     'max_delta_step': 2,
#     'lambda': 0.6,
#     'subsample': 0.5,
#     'colsample_bytree': 0.8,
#     'min_child_weight': 1.6,
#     'eta': 0.2,
#     'silent': 0,
#     'scale_pos_weight': 2,
#     'gamma': 2
# }
# # watchlist = [(dtrain, 'train'), (dval, 'val'), (dtest, 'test')]
# # train_val_features = np.concatenate((X_train, X_val), axis = 0)
# # train_val_labels = np.concatenate((y_train, y_val), axis = 0)
# # test_fold = np.zeros(train_val_features.shape[0])
# # test_fold[:X_train_need[i].shape[0]] = -1
# # ps = PredefinedSplit(test_fold = test_fold)

# # xg_model = GridSearchCV(estimator=LogisticRegression(), param_grid=params, 
# #                      scoring=my_scoring, n_jobs=-1, cv=ps, verbose=0)
# # model.fit(train_val_features, train_val_labels)
# xg_model = xgb.train(params, dtrain, num_boost_round=71)

# y_train_pred_prob = xg_model.predict(dtrain)
# y_val_pred_prob = xg_model.predict(dval)
# y_test_pred_prob = xg_model.predict(dtest)
# utils.model_key_performance(y_train_pred_prob, y_train)
# utils.model_key_performance(y_val_pred_prob, y_val)
# utils.model_key_performance(y_test_pred_prob, y_test)

In [9]:
#集成学习分割数据
X_train_1 = X_train[y_train == 1]
y_train_1 = y_train[y_train == 1]
X_train_0 = X_train[y_train == 0]
y_train_0 = y_train[y_train == 0]

In [10]:
k = 7
step_size = X_train_0.shape[0] // k

X_train_need = []
y_train_need = []
for i in range(k):
    tmp_x = X_train_0[i*step_size:min(X_train_0.shape[0]-1, i*step_size+step_size)]
    tmp_y = y_train_0[i*step_size:min(X_train_0.shape[0]-1, i*step_size+step_size)]
    X_train_need.append(np.concatenate((X_train_1, tmp_x), axis=0))
    y_train_need.append(np.concatenate((y_train_1, tmp_y), axis=0))

In [11]:
C_params = [i/10 for i in range(1, 10)] + [i for i in range(1, 10, 1)] + [i for i in range(10, 100, 10)]
cw_params = [i for i in range(1, 10, 1)] + [i for i in range(10, 100, 10)]
params = {
    'C': C_params,
    'class_weight': [{1:w} for w in cw_params],
}

In [12]:
# C_params

In [13]:
train_pred_prob_record = []
val_pred_prob_record = []
test_pred_prob_record = []
for i in range(k):
    print('i = ', i, '\n')
    train_val_features = np.concatenate((X_train_need[i], X_val), axis = 0)
    train_val_labels = np.concatenate((y_train_need[i], y_val), axis = 0)
    test_fold = np.zeros(train_val_features.shape[0])
    test_fold[:X_train_need[i].shape[0]] = -1
    ps = PredefinedSplit(test_fold = test_fold)

    model = GridSearchCV(estimator=LogisticRegression(), param_grid=params, 
                         scoring=my_scoring, n_jobs=-1, cv=ps, verbose=0)
    model.fit(train_val_features, train_val_labels)
    print(model.best_params_ )
    print(model.best_score_ )
    train_pr = model.predict_proba(X_train)[:, 1]
    val_pr = model.predict_proba(X_val)[:, 1]
    test_pr = model.predict_proba(X_test)[:, 1]
    
    utils.model_key_performance(train_pr, y_train)
    utils.model_key_performance(val_pr, y_val)
    utils.model_key_performance(test_pr, y_test)
    print('\n')
    
    train_pred_prob_record.append(train_pr)
    val_pred_prob_record.append(val_pr)
    test_pred_prob_record.append(test_pr)

i =  0 

{'class_weight': {1: 10}, 'C': 0.2}
0.478019821411


i =  1 

{'class_weight': {1: 9}, 'C': 0.2}
0.490383671867


i =  2 

{'class_weight': {1: 5}, 'C': 0.3}
0.490972426651


i =  3 

{'class_weight': {1: 10}, 'C': 0.7}
0.470022568933


i =  4 

{'class_weight': {1: 7}, 'C': 0.2}
0.488715533314


i =  5 

{'class_weight': {1: 10}, 'C': 0.2}
0.488273967226


i =  6 

{'class_weight': {1: 60}, 'C': 0.3}
0.483269551565




In [14]:
# lr0 = LogisticRegression(C=1, class_weight={1:50})
# lr0.fit(X_train_need[0], y_train_need[0])
# train_pr = lr0.predict_proba(X_train)[:, 1]
# val_pr = lr0.predict_proba(X_val)[:, 1]
# test_pr = lr0.predict_proba(X_test)[:, 1]
# utils.model_key_performance(train_pr, y_train)
# utils.model_key_performance(val_pr, y_val)
# utils.model_key_performance(test_pr, y_test)
# train_pred_prob_record.append(train_pr)
# val_pred_prob_record.append(val_pr)
# test_pred_prob_record.append(test_pr)

In [15]:
train_ensum = np.array(train_pred_prob_record).T
val_ensum = np.array(val_pred_prob_record).T
test_ensum = np.array(test_pred_prob_record).T

In [16]:
print(train_ensum.shape)
print(y_train.shape)

(58499, 7)
(58499,)


In [17]:
#bagging
bagging_train_prob = np.mean(train_ensum, axis = 1)
bagging_val_prob = np.mean(val_ensum, axis = 1)
bagging_test_prob = np.mean(test_ensum, axis = 1)
#result
utils.model_key_performance(bagging_train_prob, y_train)
utils.model_key_performance(bagging_val_prob, y_val)
utils.model_key_performance(bagging_test_prob, y_test)

(0.80224286718265125, 0.50595770508561211)

In [18]:
C_params = [i/10 for i in range(1, 10)] + [i for i in range(1, 10, 1)] + [i for i in range(10, 100, 10)]
cw_params = [i for i in range(1, 10, 1)] + [i for i in range(10, 100, 10)]
params = {
    'C': C_params,
    'class_weight': [{1:w} for w in cw_params],
}

train_val_features = np.concatenate((train_ensum, val_ensum), axis = 0)
train_val_labels = np.concatenate((y_train, y_val), axis = 0)
test_fold = np.zeros(train_val_features.shape[0])
test_fold[:train_ensum.shape[0]] = -1
ps = PredefinedSplit(test_fold = test_fold)

In [19]:
print(train_val_features.shape)
print(train_val_labels.shape)

(68776, 7)
(68776,)


In [20]:
#stacking
lr_stack = GridSearchCV(estimator=LogisticRegression(), param_grid=params, 
                         scoring=my_scoring, n_jobs=-1, cv=ps, verbose=0)
lr_stack.fit(train_val_features, train_val_labels)
print(lr_stack.best_params_ )

lr_stack_train_pred_prob = lr_stack.predict_proba(train_ensum)[:, 1]
lr_stack_val__pred_prob = lr_stack.predict_proba(val_ensum)[:, 1]
lr_stack_test_pred_prob = lr_stack.predict_proba(test_ensum)[:, 1]
utils.model_key_performance(lr_stack_train_pred_prob, y_train)
utils.model_key_performance(lr_stack_val__pred_prob, y_val)
utils.model_key_performance(lr_stack_test_pred_prob, y_test)

{'class_weight': {1: 20}, 'C': 0.6}


(0.80921025610602015, 0.50909970428242857)

In [21]:
y_test.sum()

86

In [22]:
lr_stack_test_pred_prob

array([ 0.02478268,  0.08053617,  0.41711721, ...,  0.169334  ,
        0.15065305,  0.02493949])

In [34]:
y_test_label_need = []
for x in lr_stack_test_pred_prob:
    if x < 0.13:
        y_test_label_need.append(0)
    else:
        y_test_label_need.append(1)
confusion_matrix(y_test, y_test_label_need)

array([[6955, 3237],
       [  17,   69]])