In [2]:
import gc
import os
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from keras.layers import Dense, Dropout, Embedding, Flatten, Input, concatenate, merge
from keras.layers.advanced_activations import PReLU
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from scipy.stats import pearsonr
from sklearn.decomposition import PCA
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

%matplotlib inline
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)
print(os.listdir("../input"))

['anti-preprocess', 'atec-anti-fraud']


In [3]:
# Feature columns are named f1 .. f297.
feature_columns = ['f' + str(n) for n in range(1, 298)]

# Columns that do not use the default int16 storage type.
special_dtypes = {
    'f5': 'int32',
    'f82': 'float32', 'f83': 'float32', 'f84': 'float32',
    'f85': 'float32', 'f86': 'float32',
    'id': 'str',
    'date': 'int16',
}

# Default every remaining feature to int16, then append the special cases.
dtype = {col: 'int16' for col in feature_columns if col not in special_dtypes}
dtype.update(special_dtypes)

In [4]:
# Columns fed to the network as one dense numeric block.
numerical_feature = ['f5', 'f7', 'f18', 'f25', 'f26', 'f28', 'f29', 'f30', 'f32', 'f50', 'f75', 'f82', 'f83', 'f84', 'f85', 'f86', 
                    'f90', 'f95', 'f101', 'f106', 'f112', 'f118', 'f119', 'f135', 'f136', 'f142', 'f143', 'f148', 'f149',
                    'f150', 'f179', 'f210', 'f238', 'f243', 'f244', 'f258', 'f287', 'f294', 'f295']

# Every other feature column is treated as categorical (one embedding each).
# The list is derived in `feature_columns` order rather than via
# list(set(...)): iteration order of a set of strings depends on hash
# randomisation, so the old form produced a different column / model-input
# order on every kernel restart.
_numerical_lookup = set(numerical_feature)
cat_feature = [col for col in feature_columns if col not in _numerical_lookup]

In [None]:
# Read both competition files and replace every missing value with 0.
train = pd.read_csv('../input/atec-anti-fraud/atec_anti_fraud_train.csv').fillna(0)
test = pd.read_csv('../input/atec-anti-fraud/atec_anti_fraud_test_a.csv').fillna(0)

# Rows labelled -1 are folded into the positive class (label 1).
train['label'] = train['label'].replace(-1, 1)

In [None]:
# f82-f86 stay floating point; every other feature column is cast to a
# 64-bit integer.  astype() converts the whole column in one vectorised step
# instead of calling the Python-level int() once per row via apply().
float_features = {'f82', 'f83', 'f84', 'f85', 'f86'}
for col in feature_columns:
    if col in float_features:
        continue
    train[col] = train[col].astype('int64')
    test[col] = test[col].astype('int64')

In [None]:
# Training matrices: categorical columns feed the embeddings, numerical
# columns feed the dense input; Y is the binary target.
X_cat = train[cat_feature]
X = train[numerical_feature]
Y = train['label']

# Test matrices, with columns in the same order as the training ones.
X_test_cat = test[cat_feature]
X_test = test[numerical_feature]

# Explicit copy: a 'score' column is added to this frame later on, and it
# must be an independent object rather than a slice of `test`.
submission = test[['id']].copy()

In [None]:
# Remove the two full DataFrames from the kernel namespace and run a garbage
# collection pass. After this cell the names `train` and `test` no longer exist.
del train, test
gc.collect()

In [None]:
# Model input list for the test set: one (n_rows, 1) integer array per
# categorical column, followed by the numerical block as the last element.
# Columns are taken positionally with .iloc -- `X_test_cat[:, i]` is not
# valid DataFrame indexing and raises.
x_test_cat = [
    X_test_cat.iloc[:, i].values.reshape(-1, 1)
    for i in range(X_test_cat.shape[1])
]
x_test_cat.append(X_test.values)

In [6]:
# Vocabulary size per categorical column: the largest code seen in either
# split, plus one.  Keyed by column name.
# Reads X_cat / X_test_cat because `train` and `test` have already been
# deleted by the time this cell runs in notebook order.
max_cat_values = {
    col: int(max(X_cat[col].max(), X_test_cat[col].max())) + 1
    for col in cat_feature
}

In [None]:
def scorer(y, pred):
    """Print and return the weighted TPR score read off the ROC curve.

    For each FPR limit (0.001, 0.005, 0.01) the TPR is taken at the first ROC
    point whose FPR is greater than or equal to the limit; the three values
    are combined with weights 0.4, 0.3 and 0.3.

    Args:
        y: true binary labels (1 is the positive class).
        pred: predicted scores, higher meaning more likely positive.

    Returns:
        The weighted score.
    """
    fpr, tpr, _ = roc_curve(y, pred, pos_label=1)

    def tpr_at(limit):
        # First ROC point at or beyond the requested false-positive rate.
        return tpr[np.flatnonzero(fpr >= limit)[0]]

    tpr_001 = tpr_at(0.001)
    tpr_005 = tpr_at(0.005)
    tpr_01 = tpr_at(0.01)
    score = 0.4 * tpr_001 + 0.3 * tpr_005 + 0.3 * tpr_01

    print('-----------------------------result------------------------')
    print('fpr_0.001: {0} | fpr_0.005: {1} | fpr_0.01: {2}'.format(tpr_001, tpr_005, tpr_01))
    print('score : {}'.format(score))
    return score

In [7]:
def nn_model():
    """Build and compile the embedding network used for each CV fold.

    Reads the module-level ``cat_feature``, ``max_cat_values`` and ``X``.

    Architecture:
      * one int32 input of shape (1,) per column in ``cat_feature``, each
        passed through a 6-dimensional Embedding, Dropout(0.25) and Flatten;
      * one float32 input of width ``X.shape[1]`` for the numerical block;
      * all of the above concatenated, then Dense(512) and Dense(64), each
        followed by PReLU, BatchNormalization and Dropout (0.75 / 0.5);
      * a single sigmoid output unit.

    The model's inputs are ordered as: categorical inputs in ``cat_feature``
    order, then the numerical input last.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy and Adam.
    """
    inputs = []
    flatten_layers = []
    for col in cat_feature:
        input_c = Input(shape=(1, ), dtype='int32')
        # max_cat_values is keyed by column name, not by position.
        num_c = max_cat_values[col]
        embed_c = Embedding(
            num_c,
            6,
            input_length=1
        )(input_c)
        embed_c = Dropout(0.25)(embed_c)
        flatten_c = Flatten()(embed_c)

        inputs.append(input_c)
        flatten_layers.append(flatten_c)

    input_num = Input(shape=(X.shape[1],), dtype='float32')
    flatten_layers.append(input_num)
    inputs.append(input_num)

    # Keras 2 functional API: concatenate() replaces merge(mode='concat').
    flatten = concatenate(flatten_layers)

    fc1 = Dense(512, kernel_initializer='he_normal')(flatten)
    fc1 = PReLU()(fc1)
    fc1 = BatchNormalization()(fc1)
    fc1 = Dropout(0.75)(fc1)

    fc1 = Dense(64, kernel_initializer='he_normal')(fc1)
    fc1 = PReLU()(fc1)
    fc1 = BatchNormalization()(fc1)
    fc1 = Dropout(0.5)(fc1)

    outputs = Dense(1, kernel_initializer='he_normal', activation='sigmoid')(fc1)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model


# NOTE(review): no visible cell defines the fold count or the seed; 5 folds
# and this seed are assumptions -- adjust to the intended setup.
N_FOLDS = 5
SEED = 2018


def get_rank(x):
    """Return the percentile rank (in (0, 1]) of every element of ``x``."""
    return pd.Series(x).rank(pct=True).values


kfold = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# Out-of-fold rank predictions for the training rows, and the fold-averaged
# rank predictions for the test rows.
cv_train = np.zeros(len(Y))
cv_pred = np.zeros(X_test.shape[0])

for train_index, test_index in kfold.split(X, Y):
    # kfold.split yields positional row indices, hence .iloc everywhere.
    xtr = X.iloc[train_index].values
    ytr = Y.iloc[train_index].values
    xte = X.iloc[test_index].values
    yte = Y.iloc[test_index].values

    xtr_cat = X_cat.iloc[train_index].values
    xte_cat = X_cat.iloc[test_index].values

    # One (n_rows, 1) array per categorical column, numerical block last --
    # the same input order nn_model() builds.
    n_cat = xtr_cat.shape[1]
    xtr_cat_list = [xtr_cat[:, i].reshape(-1, 1) for i in range(n_cat)]
    xte_cat_list = [xte_cat[:, i].reshape(-1, 1) for i in range(n_cat)]
    xtr_cat_list.append(xtr)
    xte_cat_list.append(xte)

    model = nn_model()
    model.fit(xtr_cat_list, ytr, epochs=20, batch_size=512, verbose=2,
              validation_data=(xte_cat_list, yte))

    cv_train[test_index] += get_rank(model.predict(x=xte_cat_list, batch_size=512, verbose=0)[:, 0])
    print(scorer(yte, cv_train[test_index]))

    # Average the per-fold test ranks so the final score stays within (0, 1].
    cv_pred += get_rank(model.predict(x=x_test_cat, batch_size=512, verbose=0)[:, 0]) / N_FOLDS

In [None]:
# Attach the accumulated test-set scores and write the submission file.
submission = submission.assign(score=cv_pred)
submission.to_csv('nn_201806061145.csv', index=False)