In [None]:
# Split the columns into categorical and continuous features.
# The names listed here are treated as categorical; every other column of X
# is treated as continuous.
categorical_features = [
    'ProductCD',
    'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2',
    'P_emaildomain', 'R_emaildomain',
    'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
]

# Iterating X yields its column labels, so the original column order is kept.
continuous_features = [column for column in X if column not in categorical_features]

In [None]:
# Log-transform features whose skewness is greater than 1, then standardise them.
class ContinuousFeatureConverter:
    """Optionally log-transform a feature, then centre and scale it.

    The skewness is measured once, on the series given to the constructor,
    and decides whether ``log_transform`` is applied in every later call to
    ``transform``. The mean and standard deviation, however, are recomputed
    from whatever series is passed to ``transform``.

    NOTE(review): because of that, train and test data are each standardised
    with their own statistics -- confirm this is intended.
    """

    def __init__(self, name, feature, log_transform):
        """Remember the feature name, the log callable and the skewness of ``feature``."""
        self.name, self.log_transform = name, log_transform
        self.skew = feature.skew()

    def transform(self, feature):
        """Return ``feature`` (log-transformed when skew > 1) centred and scaled."""
        values = self.log_transform(feature) if self.skew > 1 else feature
        centred = values - values.mean()
        # The small constant keeps the division finite for constant features.
        return centred / (values.std() + 1e-6)

In [None]:
from tqdm.autonotebook import tqdm

def log(x):
    """log10 of ``x`` after shifting it so that its smallest value maps to at least 1."""
    return np.log10(x + 1 - min(0, x.min()))


feature_converters = {}
continuous_features_processed = []
continuous_features_processed_test = []

# Each converter is built from the train column (X) and then applied to the
# train and the test column in turn; change X / X_test here to switch data.
for f in tqdm(continuous_features):
    converter = ContinuousFeatureConverter(f, X[f], log)
    feature_converters[f] = converter
    continuous_features_processed.append(converter.transform(X[f]))
    continuous_features_processed_test.append(converter.transform(X_test[f]))

# Re-assemble the converted series into float32 frames keyed by series name.
continuous_train = pd.DataFrame(
    {series.name: series for series in continuous_features_processed}
).astype(np.float32)
continuous_test = pd.DataFrame(
    {series.name: series for series in continuous_features_processed_test}
).astype(np.float32)

In [None]:
# Add 'isna_sum': the per-row count of missing values, standardised with the
# mean and standard deviation of the frame it belongs to.
# NOTE(review): train and test are each scaled with their own statistics, and
# no epsilon guards against a zero standard deviation -- confirm both are fine.
for frame in (continuous_train, continuous_test):
    missing_per_row = frame.isna().sum(axis=1)
    frame['isna_sum'] = (missing_per_row - missing_per_row.mean()) / missing_per_row.std()

In [None]:
# For every continuous feature with at least one missing value in train, add a
# 0/1 '<column>_isna' indicator to both frames and remember the column name.
isna_columns = []
for column in tqdm(continuous_features):
    train_missing = continuous_train[column].isna()
    if not train_missing.any():
        continue
    indicator = column + '_isna'
    continuous_train[indicator] = train_missing.astype(int)
    continuous_test[indicator] = continuous_test[column].isna().astype(int)
    isna_columns.append(column)

# Impute the remaining NaNs with per-column medians of the frame being filled.
# NOTE(review): test is filled with test medians, not train medians -- confirm.
continuous_train = continuous_train.fillna(continuous_train.median())
continuous_test = continuous_test.fillna(continuous_test.median())

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tqdm.autonotebook import tqdm

def categorical_encode(df_train, df_test, categorical_features, n_values=140):
    """One-hot encode categorical columns, keeping only the most frequent train values.

    Every selected column is cast to ``str`` first, so a missing value becomes
    an ordinary string value that competes for a slot like any other. For each
    column the ``n_values - 1`` most frequent values of ``df_train`` are kept;
    all other values, in both frames, are replaced by ``'Other'``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames containing at least the columns in ``categorical_features``.
        They are not modified.
    categorical_features : list
        Column labels to encode, in the order the one-hot blocks should appear.
    n_values : int, default 140
        Maximum number of categories per column, including ``'Other'``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_encoded, test_encoded)`` as float16 indicator frames with
        integer column labels. Each result carries the row index of its input
        frame, so a later column-wise ``pd.concat`` with other frames derived
        from the same input aligns row by row.
    """
    df_train = df_train[categorical_features].astype(str)
    df_test = df_test[categorical_features].astype(str)

    keep_count = max(n_values - 1, 0)  # a negative slice bound would drop values from the tail
    categories = []
    for column in tqdm(categorical_features):
        top_values = df_train[column].value_counts().iloc[:keep_count].index
        # A literal 'Other' among the frequent values must not be listed twice.
        values2use = [value for value in top_values if value != 'Other'] + ['Other']
        categories.append(values2use)
        # Vectorised replacement instead of a per-element Python lambda.
        df_train[column] = df_train[column].where(df_train[column].isin(values2use), 'Other')
        df_test[column] = df_test[column].where(df_test[column].isin(values2use), 'Other')

    # The categories are given explicitly, so fitting only validates the data;
    # the train frame is enough and avoids building a train+test copy.
    ohe = OneHotEncoder(categories=categories)
    ohe.fit(df_train)

    train_encoded = pd.DataFrame(
        ohe.transform(df_train).toarray(), index=df_train.index
    ).astype(np.float16)
    test_encoded = pd.DataFrame(
        ohe.transform(df_test).toarray(), index=df_test.index
    ).astype(np.float16)
    return train_encoded, test_encoded

In [None]:
# Print the number of distinct values of each categorical column, one per line.
for cardinality in map(lambda name: X[name].nunique(), categorical_features):
    print(cardinality)

In [None]:
# One-hot encode the categorical columns of the train (X) and test (X_test) frames,
# using the default n_values of categorical_encode.
train_categorical, test_categorical = categorical_encode(X, X_test, categorical_features)

In [None]:
# Column counts of the continuous and the one-hot categorical blocks.
# They mark the boundary between numeric and categorical columns -- very important:
# the model inputs and the batch generator slice the combined matrix at num_shape.
num_shape = continuous_train.shape[1]
cat_shape = train_categorical.shape[1]

In [None]:
# Rebuild X / X_test as [continuous columns | one-hot categorical columns],
# then drop the intermediate frames to free memory.
# NOTE(review): this overwrites the original X / X_test and deletes its own
# inputs, so the cell cannot be re-run without re-running the cells above.
# NOTE(review): pd.concat(axis=1) aligns on the row index -- confirm that both
# halves carry the same index, otherwise rows are misaligned and NaNs appear.
X = pd.concat([continuous_train, train_categorical], axis=1)
del continuous_train, train_categorical
X_test = pd.concat([continuous_test, test_categorical], axis=1)
del continuous_test, test_categorical

In [None]:
# Reset the backend session before a new model is built.
# NOTE(review): `K` is not defined in the visible cells -- presumably the Keras
# backend module imported earlier; confirm.
K.clear_session()
from keras.optimizers import Adam
from keras import regularizers
from keras.regularizers import l2 
def create_model():
    """Build and compile a two-input, two-output dense network.

    The inputs are a numeric block of width ``num_shape`` and a categorical
    block of width ``cat_shape``; they are concatenated and passed through a
    128-64-32-32-64-128 dense stack. Two linear heads reproduce the widths of
    the two inputs, and both are trained with mean squared error.

    Returns
    -------
    The compiled model, with inputs ``[numeric, categorical]`` and outputs in
    the same order.
    """
    num_inp = Input(shape=(num_shape,))
    cat_inp = Input(shape=(cat_shape,))
    hidden = concatenate([num_inp, cat_inp])

    # (units, activation, extra Dense arguments), in the order the layers are stacked.
    selu_init = {"kernel_initializer": 'lecun_normal'}
    hidden_layers = [
        (128, "selu", selu_init),
        (64, "relu", {}),
        (32, custom_gelu, {}),
        (32, custom_gelu, {}),
        (64, "relu", {}),
        (128, "selu", selu_init),
    ]
    for units, activation, extra in hidden_layers:
        hidden = Dense(units, activation=activation, **extra)(hidden)
    # A Dropout(.2) at this point is left disabled.

    # The categorical head is created first (keeps the layer creation order),
    # but the model lists the numeric output first.
    cat_out = Dense(cat_shape, activation="linear")(hidden)
    num_out = Dense(num_shape, activation="linear")(hidden)

    model = Model(inputs=[num_inp, cat_inp], outputs=[num_out, cat_out])
    model.compile(
        optimizer=Adam(.05, clipnorm=1, clipvalue=1),
        loss=["mse", "mse"],
    )
    return model

In [None]:
def inputSwapNoise(arr, p):
    """Return a noisy copy of ``arr`` with swap noise applied to every column.

    For each column, ``round(n * p)`` row positions are drawn with replacement
    and overwritten with values drawn, also with replacement, from the same
    column of the original array. Since positions can repeat and a drawn value
    can equal the one it replaces, at most a fraction ``p`` of the cells in a
    column actually changes.

    Parameters
    ----------
    arr : numpy.ndarray
        2-D array of shape (rows, columns). It is never modified, so the
        caller can keep using it as the clean reconstruction target.
    p : float
        Fraction of rows to overwrite in each column.

    Returns
    -------
    numpy.ndarray
        A new array with the same shape and dtype as ``arr``.
    """
    n, m = arr.shape
    swap_n = int(round(n * p))
    noisy = arr.copy()  # work on a copy: the input must stay clean
    if swap_n <= 0:
        return noisy
    for i in range(m):
        swap_idx = np.random.choice(n, size=swap_n)  # rows to overwrite
        # Sample replacements from the untouched source column.
        noisy[swap_idx, i] = np.random.choice(arr[:, i], size=swap_n)
    return noisy

In [None]:
def auto_generator(X, swap_rate, batch_size):
    """Endlessly yield ``([noisy_num, noisy_cat], [clean_num, clean_cat])`` batches.

    Each step reshuffles all row indexes and takes the first ``batch_size`` of
    them, so a batch never contains the same row twice. The batch is split at
    the module-level ``num_shape`` into a numeric and a categorical part, and
    swap noise is applied to each part separately.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix whose first ``num_shape`` columns are numeric.
    swap_rate : float
        Fraction of rows per column handed to ``inputSwapNoise``.
    batch_size : int
        Number of rows per batch.

    Yields
    ------
    tuple
        ``(inputs, targets)`` where the inputs are the noisy arrays and the
        targets are the corresponding clean arrays.
    """
    indexes = np.arange(X.shape[0])
    while True:
        np.random.shuffle(indexes)
        batch_rows = indexes[:batch_size]
        num_X = X[batch_rows, :num_shape]
        cat_X = X[batch_rows, num_shape:]
        # Noise is applied to copies. Handing the clean arrays over directly
        # lets an in-place noise routine overwrite the targets, which would
        # make inputs and targets identical.
        num_noisy = inputSwapNoise(num_X.copy(), swap_rate)
        cat_noisy = inputSwapNoise(cat_X.copy(), swap_rate)
        yield [num_noisy, cat_noisy], [num_X, cat_X]

In [None]:
# Rows per batch for the swap-noise generator and the steps_per_epoch
# computation; the trailing comment records the alternative value 128.
batch_size = 2048#128

In [None]:
# Endless batch generator over the combined feature matrix, with a swap rate of 0.25.
train_gen = auto_generator(X.values, .25, batch_size)

In [None]:
# Train `model_mse` on the swap-noise generator; one epoch is len(X)//batch_size batches.
# NOTE(review): `model_mse`, `epochs`, `auto_ckpt` and `warm_up_lr` are not defined
# in the visible cells -- confirm they are created earlier in the notebook.
# NOTE(review): workers=-1 combined with use_multiprocessing=True on a plain Python
# generator -- verify this behaves as intended for the installed Keras version.
hist = model_mse.fit_generator(train_gen, steps_per_epoch=len(X)//batch_size, epochs=epochs,
                           verbose=1, workers=-1, 
                           use_multiprocessing=True,
                              callbacks=[auto_ckpt, warm_up_lr])

In [None]:
# Same here: the frame is split at num_shape into the numeric and the categorical input.
# Fits `fraud_model` on the training split and validates on the validation split.
# NOTE(review): `fraud_model`, `X_tr`, `y_tr`, `X_val`, `y_val` and `ckpt` are not
# defined in the visible cells -- confirm they are created earlier in the notebook.
fraud_model.fit([X_tr.iloc[:, :num_shape], X_tr.iloc[:, num_shape:]], y_tr, epochs=100,
                batch_size=2048, 
                validation_data = ([X_val.iloc[:, :num_shape], X_val.iloc[:, num_shape:]], y_val),
               callbacks=[ckpt], verbose = 2)