In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
print(os.listdir("../input"))
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, log_loss
from skopt import BayesSearchCV
from skopt.space import Integer, Real
from sklearn.model_selection import StratifiedKFold, train_test_split
from MLFeatureSelection import FeatureSelection as FS
import lightgbm as lgb
import gc
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Feature columns are named f1 .. f297.
feature_columns = ['f{}'.format(n) for n in range(1, 298)]

# Columns given an explicit dtype: f5 and f82-f86 differ from the int16 used
# for every other feature; 'id' and 'date' are not part of feature_columns.
special_dtypes = {
    'f5': 'int32',
    'f82': 'float32', 'f83': 'float32', 'f84': 'float32',
    'f85': 'float32', 'f86': 'float32',
    'id': 'str', 'date': 'int16',
}

# Every remaining feature column is read as int16; the explicit entries are
# appended afterwards.
dtype = {col: 'int16' for col in feature_columns if col not in special_dtypes}
dtype.update(special_dtypes)

In [3]:
# Keyword arguments shared by both reads (per-column dtypes come from `dtype`).
read_options = dict(low_memory=True, dtype=dtype, compression='gzip')

train = pd.read_csv('../input/anti-preprocess/train.csv.gz', **read_options)
# Rows labelled -1 are relabelled as 1.
train['label'] = train['label'].replace(-1, 1)

test = pd.read_csv('../input/anti-preprocess/test.csv.gz', **read_options)

In [4]:
# Count the -1 entries in every column of `train`, then list together the
# column names that share the same count (the final expression is displayed).
minus_one_counts = (train == -1).sum()
grouped = pd.DataFrame(minus_one_counts).reset_index()
grouped.groupby(0).agg(lambda names: ', '.join(names))

In [7]:
# Split the training rows by label.
pos = train[train['label'] == 1]
neg = train[train['label'] == 0]

# Down-sample the label-0 rows to the number of label-1 rows.
SAMPLE_NUM = len(pos)
neg_sampled = neg.sample(SAMPLE_NUM, random_state=10)

# Stack both classes, turn every -1 into 0, restore the original row order
# and renumber the index from 0.
balance = (
    pd.concat([pos, neg_sampled])
    .replace([-1], [0])
    .sort_index()
    .reset_index(drop=True)
)

# The same -1 -> 0 substitution is applied to the test frame.
test = test.replace([-1], [0])

In [2]:
def score(pred, real):
    """Scoring function (credit to herhert): log loss of the two arguments.

    Both arguments are forwarded to ``log_loss`` unchanged and in the same
    order, so ``pred`` lands in ``log_loss``'s first positional slot and
    ``real`` in its second.
    NOTE(review): ``validation`` in this file calls the loss function as
    ``lossfunction(y_test, predictions)``, i.e. the true labels arrive in the
    parameter named ``pred`` — the names look swapped; confirm before renaming.
    """
    loss = log_loss(pred, real)
    return loss

# Alias of the balanced training frame; `df` is the name passed to sf.ImportDF below.
df = balance

def validation(X, Y, features, clf, lossfunction):
    """Return the mean 5-fold cross-validated loss of `clf` on the given features.

    Args:
        X: DataFrame holding (at least) the columns listed in `features`.
        Y: labels aligned row-for-row with `X` (Series or array-like).
        features: list of column names of `X` to train on.
        clf: estimator exposing `fit` and `predict_proba` (or just `predict`).
        lossfunction: callable invoked as `lossfunction(y_true, y_pred)`.

    Returns:
        The mean of the per-fold values returned by `lossfunction`.
    """
    X_feat = X[features]
    # StratifiedKFold yields *positional* indices, so index positionally.
    # (`DataFrame.ix` no longer exists in current pandas, and label-based
    # lookup would be wrong whenever the index is not 0..n-1.)
    y_all = np.asarray(Y)
    # Prefer class probabilities: a log-loss style metric is defined on
    # probabilities, not on the hard labels returned by `predict`.
    use_proba = hasattr(clf, 'predict_proba')

    fold_losses = []
    kf = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
    for train_index, test_index in kf.split(X_feat, y_all):
        X_train, X_test = X_feat.iloc[train_index], X_feat.iloc[test_index]
        y_train, y_test = y_all[train_index], y_all[test_index]
        clf.fit(X_train, y_train)
        if use_proba:
            # Column 1 is taken as the positive-class probability
            # (assumes binary labels with classes ordered [0, 1]).
            fold_pred = clf.predict_proba(X_test)[:, 1]
        else:
            fold_pred = clf.predict(X_test)
        fold_losses.append(lossfunction(y_test, fold_pred))
    return np.mean(fold_losses)

def add(x, y):
    """Return the sum of `x` and `y`."""
    total = x + y
    return total

def substract(x, y):
    """Return `x` minus `y`."""
    difference = x - y
    return difference

def times(x, y):
    """Return the product of `x` and `y`."""
    product = x * y
    return product

def divide(x, y, eps=0.001):
    """Return the smoothed ratio ``(x + eps) / (y + eps)``.

    Args:
        x: numerator (scalar or array-like supporting ``+`` and ``/``).
        y: denominator (same kinds as `x`).
        eps: offset added to both operands so that ``y == 0`` does not divide
            by zero. Defaults to 0.001, the value that was previously
            hard-coded, so two-argument calls behave exactly as before.

    Returns:
        The smoothed quotient.
    """
    return (x + eps) / (y + eps)

# Lookup table from operator symbol to the matching two-argument helper.
CrossMethod = dict([
    ('+', add),
    ('-', substract),
    ('*', times),
    ('/', divide),
])

# Feature-selection run driven by the MLFeatureSelection `FS.Select` object.
# The calls below configure the selector step by step and then start it.
sf = FS.Select(Sequence = True, Random = False, Cross = False) # create the selector and choose which stages to run (only Sequence is enabled)
sf.ImportDF(df, label ='label') # import the dataset and the name of the target column
#sf.ImportCrossMethod(CrossMethod)  # (disabled) would register the CrossMethod table
sf.ImportLossFunction(score, direction = 'descend') # import the scoring function and the optimisation direction
sf.InitialNonTrainableFeatures(['id','date', 'label']) # columns that must not be used as training features
sf.InitialFeatures(feature_columns) # starting feature combination
sf.GenerateCol() # generate the feature pool (see the library's README for this function's arguments)
sf.SetSample(1, samplemode = 1) # set the sampling ratio and sampling mode
sf.SetTimeLimit(240) # maximum running time of the algorithm, in minutes
sf.clf = lgb.LGBMClassifier(random_state=1, num_leaves =6, n_estimators=200, max_depth=3, learning_rate = 0.1, n_jobs=4) # model used to evaluate each feature set
sf.SetLogFile('record.log') # initialise the log file
sf.run(validation) # pass in the validation function and start the run

# Search space handed to BayesSearchCV: single-element lists are fixed
# settings, Integer/Real entries are ranges to explore.
search_params = dict(
    boosting_type=['gbdt'],
    objective=['binary'],
    metric=['auc'],
    max_depth=Integer(2, 10),
    num_leaves=Integer(4, 60),
    learning_rate=Real(0.001, 0.08),
    feature_fraction=Real(0.1, 1.),
    bagging_fraction=Real(0.1, 1),
    bagging_freq=Integer(2, 20),
    num_threads=[-1],
    num_iterations=Integer(40, 300),
    min_child_samples=Integer(5, 100),   # added to the search; default: 20
    min_child_weight=Real(.000001, 0.1),  # added to the search; default: 1e-3
    max_bin=Integer(50, 200),
    # Not searched: scale_pos_weight (candidate range Integer(1, 100)).
)
# Bayesian hyper-parameter search over `search_params`: 20 iterations, scored
# by ROC AUC on 5 shuffled stratified folds, without a final refit.
base_model = lgb.LGBMClassifier(objective='binary', metric='auc', n_jobs=-1, verbose=0)
search_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

bayes_cv_tuner = BayesSearchCV(
    estimator=base_model,
    search_spaces=search_params,
    n_iter=20,
    scoring='roc_auc',
    cv=search_folds,
    refit=False,
    n_jobs=1,
    verbose=3,
    random_state=50,
)
def status_print(optim_result):
    """Status callback during the Bayesian hyperparameter search.

    Prints how many parameter settings have been evaluated so far together
    with the best score and best parameters found. The values are read from
    the module-level `bayes_cv_tuner`; `optim_result` (the argument supplied
    to the callback) is accepted but not used.
    """
    # One row per model tested so far.
    models_tested = len(pd.DataFrame(bayes_cv_tuner.cv_results_))

    print('Model #{}\nBest auc: {}\nBest params: {}\n'.format(
        models_tested,
        np.round(bayes_cv_tuner.best_score_, 5),
        bayes_cv_tuner.best_params_
    ))
# Run the Bayesian search; status_print is invoked as the progress callback.
# NOTE(review): X and Y are not defined in the cells visible here — confirm
# they are created before this statement runs on a fresh kernel.
result = bayes_cv_tuner.fit(X, Y, callback=status_print)

Model #11

Best auc: 0.98071

Best params: {'bagging_fraction': 0.46688427510068276, 'bagging_freq': 3, 'boosting_type': 'gbdt', 'feature_fraction': 0.76121943556429772, 'learning_rate': 0.066139890726866046, 'max_bin': 153, 'max_depth': 9, 'metric': 'auc', 'min_child_samples': 24, 'min_child_weight': 0.047370946628263792, 'num_iterations': 221, 'num_leaves': 22, 'num_threads': -1, 'objective': 'binary'}


# LightGBM parameters used for the cross-validation runs.
# NOTE(review): these match the best setting printed by the search except
# for num_iterations (140 here, 221 in the printed output) — confirm intent.
params_op = dict(
    bagging_fraction=0.46688427510068276,
    bagging_freq=3,
    boosting_type='gbdt',
    feature_fraction=0.76121943556429772,
    learning_rate=0.066139890726866046,
    max_bin=153,
    max_depth=9,
    metric='auc',
    min_child_samples=24,
    min_child_weight=0.047370946628263792,
    num_iterations=140,
    num_leaves=22,
    num_threads=-1,
    objective='binary',
)
# NOTE(review): X and Y are not defined in the cells visible here — confirm
# they exist before this statement runs on a fresh kernel.
dtrain = lgb.Dataset(X, Y)

# When `num_iterations` is present in the params dict LightGBM uses it instead
# of the `num_boost_round` argument (and only emits a warning, which this
# notebook silences). Strip it from the params and pass the round count
# explicitly so each call trains for the number of rounds it asks for.
cv_params = {key: value for key, value in params_op.items() if key != 'num_iterations'}

# AUC cross-validation with the round count configured in params_op.
result = lgb.cv(cv_params, dtrain, num_boost_round=params_op['num_iterations'],
                metrics='auc', early_stopping_rounds=5, verbose_eval=5)

# Second run with metrics='binary' and up to 400 rounds (previously capped by
# params_op['num_iterations'] despite the explicit argument).
result2 = lgb.cv(cv_params, dtrain, num_boost_round=400, metrics='binary',
                 early_stopping_rounds=5, verbose_eval=5)

In [41]:
# Fold count and shuffled stratified splitter read by get_oof().
NFOLDS = 5
kf = StratifiedKFold(NFOLDS, shuffle=True, random_state=10)

In [66]:
def get_oof():
    """Train one LightGBM model per fold and collect out-of-fold predictions.

    Reads the module-level `X`, `Y`, `X_test`, `kf` and `NFOLDS`.

    Returns:
        A pair of column vectors ``(oof_train, oof_test)``:
        ``oof_train`` holds, for every row of `X`, the prediction made by the
        model of the fold in which that row was held out; ``oof_test`` holds
        the mean over all fold models of the predictions on `X_test`.
    """
    # NOTE(review): num_leaves is 40 here while the printed search result
    # lists 22 — confirm this difference is intentional.
    lgb_params = dict(
        bagging_fraction=0.46688427510068276,
        bagging_freq=3,
        boosting_type='gbdt',
        feature_fraction=0.76121943556429772,
        learning_rate=0.066139890726866046,
        max_bin=153,
        max_depth=9,
        metric='auc',
        min_child_samples=24,
        min_child_weight=0.047370946628263792,
        num_iterations=221,
        num_leaves=40,
        num_threads=-1,
        objective='binary',
    )

    features = X.values
    labels = Y.values
    holdout = X_test.values

    oof_train = np.zeros((X.shape[0],))
    # One row of test-set predictions per fold model.
    per_fold_test = np.empty((NFOLDS, X_test.shape[0]))

    for fold, (fit_idx, held_idx) in enumerate(kf.split(X, Y)):
        fold_data = lgb.Dataset(features[fit_idx], labels[fit_idx])
        booster = lgb.train(lgb_params, fold_data)

        oof_train[held_idx] = booster.predict(features[held_idx])
        per_fold_test[fold, :] = booster.predict(holdout)

    oof_test = per_fold_test.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [67]:
def scorer(y, pred):
    """Print and return a weighted sum of TPR values at three FPR levels.

    The ROC curve of `pred` against `y` (positive label 1) is computed, and
    the TPR is read at the first curve point whose FPR is at least 0.001,
    0.005 and 0.01. The result is ``0.4 * tpr@0.001 + 0.3 * tpr@0.005 +
    0.3 * tpr@0.01``.
    """
    fpr, tpr, thresholds = roc_curve(y, pred, pos_label=1)

    def tpr_at(fpr_level):
        # TPR at the first ROC point whose FPR reaches `fpr_level`.
        return tpr[np.where(fpr >= fpr_level)[0][0]]

    tpr_001 = tpr_at(0.001)
    tpr_005 = tpr_at(0.005)
    tpr_01 = tpr_at(0.01)

    score = 0.4 * tpr_001 + 0.3 * tpr_005 + 0.3 * tpr_01
    print('-----------------------------result------------------------')
    print('fpr_0.001: {0} | fpr_0.005: {1} | fpr_0.01: {2}'.format(tpr_001, tpr_005, tpr_01))
    print('score : {}'.format(score))
    return score

# Produce out-of-fold predictions, write the averaged test predictions to the
# submission file, then score the out-of-fold predictions against Y.
# NOTE(review): `submission` (and X / Y / X_test used by get_oof) are not
# defined in the cells visible here — confirm they exist on a fresh kernel.
oof_train, oof_test = get_oof() 
#oof['oof_train'] = oof_train
submission['score'] = oof_test  # oof_test is the (n, 1) array returned by get_oof
submission.to_csv('lgb_201805261953.csv', index=False)
#oof.to_csv('oof_all.csv', index=False)

# Last expression of the cell: prints the breakdown and displays the score.
scorer(Y, oof_train)