In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_auc_score
import numpy as np
from scipy.sparse import hstack
from sklearn.preprocessing.data import OneHotEncoder

In [2]:
# Read the training CSV and the "test_a" CSV from the current directory,
# in that order, into df_train and df_test.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./atec_anti_fraud_train.csv', './atec_anti_fraud_test_a.csv')
)

In [3]:
# Show how many training rows carry each distinct value of the `label` column.
print(df_train['label'].value_counts())

 0    977884
 1     12122
-1      4725
Name: label, dtype: int64


In [4]:
## Keep only the rows whose label is not -1, then carve out a validation set.
has_usable_label = df_train['label'] != -1
new_df_train = df_train.loc[has_usable_label]

# Feature matrix: the label-based column slice from 'f1' through 'f290' (both ends inclusive).
X_all = new_df_train.loc[:, 'f1':'f290'].values
# Target vector: the matching `label` values.
y_all = new_df_train['label'].values

# 70% train / 30% validation; the fixed random_state makes the split repeatable.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_all,
    y_all,
    test_size=0.3,
    random_state=42
)

In [7]:
# Report the (rows, columns) shape of each split next to the length of its label vector.
train_shapes = (X_train.shape, y_train.shape)
valid_shapes = (X_validation.shape, y_validation.shape)
print("X_train shaple: %s, y_train shape: %s" % train_shapes)
print("X_valid shaple: %s, y_valid shape: %s" % valid_shapes)

X_train shaple: (693004, 290), y_train shape: (693004,)
X_valid shaple: (297002, 290), y_valid shape: (297002,)


In [8]:
# Class-imbalance weight: number of label-0 rows divided by number of label-1 rows
# in the training split.
# NOTE(review): the recorded repr below shows an integer scale_pos_weight (81);
# presumably this ran under Python 2, where `/` on integer counts floors.
# Confirm whether a fractional ratio is wanted.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()

gbdt = xgb.XGBClassifier(
    nthread=24,
    learning_rate=0.08,
    n_estimators=50,
    max_depth=5,
    gamma=0,
    subsample=0.9,
    colsample_bytree=0.5,
    scale_pos_weight=n_negative / n_positive
)
# Left as the final bare expression so the notebook displays the fitted estimator.
gbdt.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.08, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=50, nthread=24,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=81, seed=0, silent=True, subsample=0.9)

In [9]:
# Score the held-out split: column 1 of predict_proba is used as the ranking score
# for ROC AUC against the validation labels.
validation_proba = gbdt.predict_proba(X_validation)
y_pred_validation = validation_proba[:, 1]
gbdt_validation_auc = roc_auc_score(y_validation, y_pred_validation)
print('xgboost validation auc: %.5f' % gbdt_validation_auc)

xgboost validation auc: 0.98396


In [13]:
# Replace each sample's raw features with the leaf it reaches in the fitted
# booster: `apply` returns a 2-D array of leaf indices, one row per sample.
X_train_leaves = gbdt.apply(X_train)
X_validation_leaves = gbdt.apply(X_validation)

# Stack train rows on top of validation rows and cast to int32, so that a single
# encoder is fitted on the leaf indices seen in BOTH splits.
All_leaves = np.vstack((X_train_leaves, X_validation_leaves)).astype(np.int32)

# One-hot encode the leaf indices column by column.
xgbenc = OneHotEncoder()
X_trans = xgbenc.fit_transform(All_leaves)

# The first `train_rows` rows of the encoded matrix belong to the training split;
# everything after them is the validation split (stored as X_train_valid).
train_rows, cols = X_train_leaves.shape
X_train_trans = X_trans[:train_rows]
X_train_valid = X_trans[train_rows:]

encoded_train_shapes = (X_train_trans.shape, y_train.shape)
encoded_valid_shapes = (X_train_valid.shape, y_validation.shape)
print("X_train shaple: %s, y_train shape: %s" % encoded_train_shapes)
print("X_valid shaple: %s, y_valid shape: %s" % encoded_valid_shapes)

X_train shaple: (693004, 1493), y_train shape: (693004,)
X_valid shaple: (297002, 1493), y_valid shape: (297002,)


In [14]:
# Fit a class-weighted logistic regression on the one-hot leaf features of the
# training rows (the first `train_rows` rows of X_trans).
gbdtenc_lr = LogisticRegression(class_weight='balanced')
train_leaf_features = X_trans[:train_rows]
# Left as the final bare expression so the notebook displays the fitted estimator.
gbdtenc_lr.fit(train_leaf_features, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [15]:
# Evaluate the leaf-encoded logistic regression on the validation rows
# (everything in X_trans after the first `train_rows` rows).
validation_leaf_features = X_trans[train_rows:]
y_pred_val_gbdtenc_lr = gbdtenc_lr.predict_proba(validation_leaf_features)[:, 1]
gbdtenc_lr_auc = roc_auc_score(y_validation, y_pred_val_gbdtenc_lr)
print('gbdt encoder lr validation auc: %.5f' % gbdtenc_lr_auc)

gbdt encoder lr validation auc: 0.98204


In [None]:
## Retrain the whole pipeline on every usable labelled row (X_all / y_all),
## score the test set and write the scores to output.csv.

# Stage 1: gradient-boosted trees with the same hyper-parameters as the validated
# model; the positive-class weight is the label-0 / label-1 count ratio of y_all.
online_gbdt = xgb.XGBClassifier(nthread=24, learning_rate=0.08, n_estimators=50,
                         max_depth=5, gamma=0, subsample=0.9, colsample_bytree=0.5,
                         scale_pos_weight=(y_all == 0).sum()/(y_all == 1).sum())
online_gbdt.fit(X_all, y_all)

# Stage 2: describe each training row by the leaf it reaches in every tree.
X_all_leaves = online_gbdt.apply(X_all)
X_all_leaves = X_all_leaves.astype(np.int32)

# The encoder is fitted on training leaves only, so a test row may land in a leaf
# no training row reached. handle_unknown='ignore' encodes such a leaf as all
# zeros for that tree instead of aborting the transform with an error.
online_xgbenc = OneHotEncoder(handle_unknown='ignore')
X_all_trans = online_xgbenc.fit_transform(X_all_leaves)

# Stage 3: class-weighted logistic regression on the one-hot leaf features.
online_gbdtenc_lr = LogisticRegression(class_weight='balanced')
online_gbdtenc_lr.fit(X_all_trans, y_all)

# Push the test features through the same booster and the same fitted encoder.
X_test_leaves = online_gbdt.apply(df_test.loc[:, 'f1':'f290'].values)
X_test_leaves = X_test_leaves.astype(np.int32)
X_test_trans = online_xgbenc.transform(X_test_leaves)

# Column 1 of predict_proba is the score written out for each test row.
y_pred_test_gbdtenc_lr = online_gbdtenc_lr.predict_proba(X_test_trans)[:, 1]

# Fix: the original called `dp.DataFrame()`, but pandas is imported as `pd`, so the
# cell raised NameError after all the training work and never wrote the file.
output_df = pd.DataFrame()
output_df['id'] = df_test.id
output_df['score'] = y_pred_test_gbdtenc_lr
# NOTE(review): to_csv also writes the DataFrame index as an unnamed first column;
# pass index=False if the consumer expects only `id` and `score` - confirm the format.
output_df.to_csv('output.csv')