In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
print(os.listdir("../input"))
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
import gc
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
pd.set_option('display.max_columns', 300)
from sklearn.metrics import auc, roc_curve
# Any results you write to the current directory are saved as output.

In [2]:
# Column names 'f1' .. 'f297'.
feature_columns = ['f{}'.format(idx) for idx in range(1, 298)]

# Half-precision dtype for every listed column except f5 and f82..f86.
# NOTE(review): no visible cell passes this map to pd.read_csv, and the
# explicit dtypes the author drafted for the excluded columns
# (f5 -> int32, f82..f86 -> float32, id -> str, date -> int16) are disabled.
dtype = {
    name: 'float16'
    for name in feature_columns
    if name not in ('f5', 'f82', 'f83', 'f84', 'f85', 'f86')
}

In [3]:
# Load the labelled training file and the unlabelled test file.
train = pd.read_csv('../input/atec-anti-fraud/atec_anti_fraud_train.csv')
test = pd.read_csv('../input/atec-anti-fraud/atec_anti_fraud_test_a.csv')

# Fold label -1 into the positive class (1).
# NOTE(review): presumably -1 marks rows without a confirmed label -- confirm
# that treating them as positives is intended.
train['label'] = train['label'].replace({-1: 1})

# Missing values become 0 in both frames; done in place on the loaded frames.
train.fillna(value=0, inplace=True)
test.fillna(value=0, inplace=True)

In [4]:
# Hold-out split on the integer `date` column: rows below 20171100 are used
# for training (TR), rows above it for validation (VA).  A row whose date is
# exactly 20171100 would land in neither set.
# NOTE(review): `date` looks like a YYYYMMDD integer -- confirm.
#
# The index is reset as part of the expression instead of calling
# reset_index(inplace=True) on the filtered result, so TR is a fresh,
# independently owned frame with a contiguous 0..n-1 index.
TR = train[train['date'] < 20171100].reset_index(drop=True)
VA = train[train['date'] > 20171100]

In [5]:
# Training features / target: everything in TR except the identifier, the
# date and the label itself.
X = TR.drop(['id', 'label', 'date'], axis=1)
Y = TR['label']

# Test features (the test frame has no label column to drop).
X_test = test.drop(['id', 'date'], axis=1)

# Validation features / target from the later time period.
X_valid = VA.drop(['id', 'date', 'label'], axis=1)
Y_valid = VA['label']

# Explicit copy: later cells assign a 'score' column to this frame, and doing
# that on a bare column selection of `test` is a chained-assignment hazard
# (SettingWithCopyWarning / write to a temporary).
submission = test[['id']].copy()

In [6]:
# Drop the names of the raw frames now that X, Y, X_test, X_valid, Y_valid and
# submission have been derived from them, then run a garbage collection pass.
# TR is intentionally kept: get_oof() still reads it.
del train, test, VA
gc.collect()

In [7]:
def scorer(y, pred):
    """Print and return a weighted sum of TPR values at three low-FPR levels.

    The ROC curve of ``pred`` against ``y`` is computed with class 1 as the
    positive label.  For each FPR level (0.001, 0.005, 0.01) the TPR is read
    at the first curve point whose FPR is greater than or equal to that level.
    The score is ``0.4 * tpr@0.001 + 0.3 * tpr@0.005 + 0.3 * tpr@0.01``.

    Args:
        y: true labels, forwarded unchanged to ``roc_curve``.
        pred: predicted scores, forwarded unchanged to ``roc_curve``.

    Returns:
        The weighted score; it is also printed with the three TPR values.
    """
    fpr, tpr, _ = roc_curve(y, pred, pos_label=1)

    def tpr_at(level):
        # First ROC point at or beyond the requested false-positive rate.
        return tpr[np.flatnonzero(fpr >= level)[0]]

    tpr_001 = tpr_at(0.001)
    tpr_005 = tpr_at(0.005)
    tpr_01 = tpr_at(0.01)
    score = 0.4 * tpr_001 + 0.3 * tpr_005 + 0.3 * tpr_01

    print('-----------------------------result------------------------')
    print('fpr_0.001: {0} | fpr_0.005: {1} | fpr_0.01: {2}'.format(tpr_001, tpr_005, tpr_01))
    print('score : {}'.format(score))
    return score

In [8]:
# Number of folds; get_oof() sizes its per-fold prediction buffer with it.
NFOLDS = 5
# Shuffled, stratified splitter with a fixed seed.
kf = StratifiedKFold(shuffle=True, random_state=10, n_splits=NFOLDS)
def get_oof():
    """Train one XGBoost classifier per fold and average their test predictions.

    For every fold produced by ``kf.split(TR, Y)``:
      * the fold's training rows are taken from ``TR`` and every row with
        ``label == 1`` is appended a second time before fitting;
      * the model is fitted with the held-out part of the fold as evaluation
        set (metric 'auc', early stopping after 50 rounds);
      * class-1 probabilities for ``X_test`` are stored for this fold;
      * class-1 probabilities for ``X_valid`` are passed to ``scorer`` together
        with ``Y_valid`` (which prints the result).

    Reads the module-level ``TR``, ``X``, ``Y``, ``X_test``, ``X_valid``,
    ``Y_valid``, ``kf`` and ``NFOLDS``.

    Returns:
        Array of shape ``(X_test.shape[0], 1)`` holding the mean over folds of
        the class-1 probabilities for the test rows.
    """
    fold_test_preds = np.empty((NFOLDS, X_test.shape[0]))

    xgb_params = dict(
        max_depth=7,
        learning_rate=0.07,
        n_estimators=200,  # 928
        silent=True,
        objective='binary:logistic',
        booster='gbtree',
        n_jobs=-1,
        gamma=3.8111289765374132e-05,
        min_child_weight=300,  # 22
        max_delta_step=4,
        subsample=0.8,  # 0.65
        colsample_bytree=0.7,  # 0.5
        colsample_bylevel=0.8,
        scale_pos_weight=1,
        random_state=10,
        eval_metric='auc',
        tree_method='auto',
    )

    for fold, (fit_idx, holdout_idx) in enumerate(kf.split(TR, Y)):
        fold_frame = TR.iloc[fit_idx]
        # Duplicate the positive rows of this fold's training part.
        positives = fold_frame[fold_frame['label'] == 1]
        fold_frame = pd.concat([fold_frame, positives], ignore_index=True)

        x_fit = fold_frame.drop(['id', 'date', 'label'], axis=1).values
        y_fit = fold_frame['label'].values
        x_holdout = X.values[holdout_idx]
        y_holdout = Y.values[holdout_idx]

        clf = xgb.XGBClassifier(**xgb_params)
        clf.fit(
            x_fit,
            y_fit,
            eval_set=[(x_holdout, y_holdout)],
            eval_metric='auc',
            early_stopping_rounds=50,
        )

        fold_test_preds[fold, :] = clf.predict_proba(X_test.values)[:, 1]
        valid_pred = clf.predict_proba(X_valid.values)[:, 1]
        scorer(Y_valid, valid_pred)

        # Release the model and its predictions before the next fold.
        del valid_pred, clf
        gc.collect()

    return fold_test_preds.mean(axis=0).reshape(-1, 1)

In [None]:
# Run the per-fold XGBoost training; at this point in the notebook get_oof()
# returns only the fold-averaged test predictions, as an (n_test, 1) array.
oof_test = get_oof() 
# Attach the averaged class-1 probabilities to the id-only submission frame
# and write it to the working directory.
submission['score'] = oof_test
submission.to_csv('xgb_201806071421.csv', index=False)

# Hyper-parameter search over an XGBoost classifier, scored by ROC-AUC with
# 5-fold stratified cross-validation and 6 search iterations.
# NOTE(review): BayesSearchCV is not imported in any visible cell, so on a
# fresh kernel this statement raises NameError.  It is presumably
# scikit-optimize's `skopt.BayesSearchCV` -- confirm and add the import to
# the imports cell.
bayes_cv_tuner = BayesSearchCV(
    estimator=xgb.XGBClassifier(
        n_jobs=4,
        objective='binary:logistic',
        eval_metric='auc',
        silent=1,
        tree_method='approx',
    ),
    search_spaces={
        'learning_rate': (0.01, 0.08),
        'min_child_weight': (10, 50),
        'max_depth': (2, 10),
        'max_delta_step': (1, 10),
        'subsample': (0.5, 0.9, 'uniform'),
        'colsample_bytree': (0.5, 0.9, 'uniform'),
        'colsample_bylevel': (0.1, 1.0, 'uniform'),
        'gamma': (1e-9, 0.5, 'log-uniform'),
        'n_estimators': (50, 320),
        # Single-value list: this setting is held at 5 during the search.
        'scale_pos_weight': [5],
    },
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=1,
    n_iter=6,
    verbose=3,
    # No final refit is requested; status_print reads the best score and
    # parameters from the tuner.
    refit=False,
    random_state=42,
)

def status_print(optim_result):
    """Status callback during the Bayesian hyperparameter search.

    Prints how many models have been evaluated so far together with the best
    ROC-AUC (rounded to 5 decimals) and the best parameters, all read from
    the module-level ``bayes_cv_tuner``.

    Args:
        optim_result: value handed to the callback by the search; not used.
    """
    # One row per model tested so far.
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)

    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 5),
        bayes_cv_tuner.best_params_,
    ))

# Fit the model: run the search on the training matrix X / target Y, passing
# status_print as the callback so progress is printed during the search.
result = bayes_cv_tuner.fit(X, Y, callback=status_print)

In [None]:
# Rebinds the module-level splitter used by get_oof(): still 5 shuffled,
# stratified folds, but seeded with 42 instead of the earlier 10.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
def get_oof():
    """Produce out-of-fold train predictions and fold-averaged test predictions.

    For every fold of ``kf.split(X, Y)`` an XGBoost classifier is fitted on
    the fold's training rows.  Its class-1 probabilities for the held-out rows
    are written into the out-of-fold vector, and its class-1 probabilities for
    ``X_test`` are stored for later averaging.

    Reads the module-level ``X``, ``Y``, ``X_test`` and ``kf``.  This
    definition replaces the earlier ``get_oof`` in the notebook.

    Returns:
        Tuple ``(oof_train, oof_test)``:
          * ``oof_train`` -- shape ``(X.shape[0], 1)``, each row predicted by
            the model that did not train on it;
          * ``oof_test`` -- shape ``(X_test.shape[0], 1)``, mean over folds.
    """
    # Size the per-fold buffer from the splitter itself rather than a literal,
    # so changing kf's n_splits cannot leave uninitialised rows in the mean
    # or overflow the buffer.
    n_folds = kf.get_n_splits()

    oof_train = np.zeros((X.shape[0],))
    oof_test_skf = np.empty((n_folds, X_test.shape[0]))

    for i, (train_index, test_index) in enumerate(kf.split(X, Y)):
        x_tr = X.values[train_index]
        y_tr = Y.values[train_index]
        x_te = X.values[test_index]

        model = xgb.XGBClassifier(max_depth=8,
                                  learning_rate=0.060099928347492317,
                                  n_estimators=226, #928
                                  silent=True,
                                  objective='binary:logistic',
                                  booster='gbtree',
                                  n_jobs=-1,
                                  gamma=3.8111289765374132e-05,
                                  min_child_weight=22, #22
                                  max_delta_step=9,
                                  subsample=0.65, #0.65
                                  colsample_bytree=0.5,#0.5
                                  colsample_bylevel=0.81959809744645584,
                                  scale_pos_weight=5,
                                  random_state=10,
                                  eval_metric = 'auc',
                                  tree_method='approx')

        model.fit(x_tr, y_tr)
        # Column 1 of predict_proba is the probability of class 1.
        oof_train[test_index] = model.predict_proba(x_te)[:, 1]
        oof_test_skf[i, :] = model.predict_proba(X_test.values)[:, 1]

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

# Uses the second get_oof() definition, which returns both the out-of-fold
# train predictions and the fold-averaged test predictions.
oof_train, oof_test = get_oof() 

# Print the weighted low-FPR TPR score of the out-of-fold predictions against
# the training labels.
scorer(Y, oof_train)#0.65 subsample

#oof['oof_train'] = oof_train
# Overwrites the 'score' column written by the earlier run and saves under a
# different file name.
submission['score'] = oof_test
submission.to_csv('xgb_201805281839.csv', index=False)
#oof.to_csv('oof_all.csv', index=False)


# ROC curve of the out-of-fold train predictions, zoomed to FPR <= 0.02.
# Author's note: 0.9835 (presumably a previously observed AUC -- confirm).
fpr, tpr, _ = roc_curve(Y, oof_train)
roc_auc = auc(fpr, tpr)
lw = 2

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, color='darkorange', lw=lw,
        label='ROC curve (area = %0.4f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
ax.set_xlim([0.0, 0.02])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
ax.legend(loc="lower right")
plt.show()