In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics

In [2]:
random_state = 42
np.random.seed(random_state)

In [18]:
def gen_fake_norm_dateset(column_size=20, instance_size=100000):
    """
    Input size: total batch size
    Distribution: gen a fake dataset for test, 20 coloumns is normal distributaion.
    """
    dataset = {}
    for i in range(column_size):
        dataset['col_{}'.format(i)] = np.random.normal(0,1,instance_size)
    df = pd.DataFrame(dataset)
    train = df[:instance_size//2]
    test = df[instance_size//2:]
    # add drift to column 0
    test['col_0'] += np.random.normal(0.1,0.5,len(test))
    return train, test

In [19]:
batch1, batch2 = gen_fake_norm_dateset()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['col_0'] += np.random.normal(0.1,0.5,len(test))


In [20]:
def train_test_split(X, y, test_size, random_state=2018):
    """
    split data to train and test
    """
    sss = list(StratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state).split(X, y))
    X_train = np.take(X, sss[0][0], axis=0)
    X_test = np.take(X, sss[0][1], axis=0)
    y_train = np.take(y, sss[0][0], axis=0)
    y_test = np.take(y, sss[0][1], axis=0)
    return [X_train, X_test, y_train, y_test]

In [21]:
def get_fea_importance(clf, feature_name):
    """
    get feature importance from lightGBM
    """
    gain = clf.feature_importance('gain')
    importance_df = pd.DataFrame({
        'feature':clf.feature_name(),
        'split': clf.feature_importance('split'),
        'gain': gain, # * gain / gain.sum(),
        'gain_percent':100 *gain / gain.sum(),
        }).sort_values('gain',ascending=False)
    return importance_df

In [22]:
def adversial_validation(batch1, batch2):
    """
    split two batch to get importance
    """
    feature_name = list(batch1.columns)
    train_X = batch1
    train_Y = np.ones(train_X.shape[0])
    test_X = batch2
    test_Y = np.zeros(test_X.shape[0])
    X = np.concatenate((train_X.values,test_X.values),axis=0)
    y = np.concatenate((train_Y,test_Y),axis=0)
    test_size = int(len(X)/5) 
    X, X_test, y, y_test = train_test_split(X, y, test_size, random_state = 42)
    para = {
        'num_leaves': 6,
        'learning_rate': 0.1,
        'bagging_fraction': 0.2, 
        'feature_fraction': 0.5,
        'max_depth': 3, 
        "objective": "binary", 
        "metric":"auc", 
        'verbose': -1, 
        "seed": 42, 
        'num_threads': 8,
    }
    lgb_train = lgb.Dataset(X, y, free_raw_data=True)
    lgb_val = lgb.Dataset(X_test, y_test, free_raw_data=True, reference=lgb_train)
    lgb_model = lgb.train(para, lgb_train, valid_sets=lgb_val, valid_names='eval',feature_name=feature_name,
                                verbose_eval=False, early_stopping_rounds=10, num_boost_round=50)
    fpr, tpr, thresholds = metrics.roc_curve(
        y_test, lgb_model.predict(X_test, num_iteration = lgb_model.best_iteration))
    auc = metrics.auc(fpr, tpr)
    print("----Adversial Score is {}------".format(auc))
    fea_importance_adversial = get_fea_importance(lgb_model, feature_name)
    print(fea_importance_adversial.head(10))
    return fea_importance_adversial, auc

### get the batch split result, feature importance and auc

In [23]:
fea_imp, auc_true = adversial_validation(batch1, batch2)

----Adversial Score is 0.541772355------
   feature  split         gain  gain_percent
0    col_0     29  2500.973512     87.937494
5    col_5      8    51.052700      1.795080
3    col_3      4    32.793350      1.153057
7    col_7      4    30.666470      1.078273
11  col_11      4    27.965760      0.983313
14  col_14      3    27.519770      0.967631
9    col_9      3    24.406670      0.858170
12  col_12      2    23.328800      0.820271
18  col_18      2    21.233900      0.746612
4    col_4      3    18.196711      0.639820


### Estimate the threshold. We could run more to get a distribution 

In [24]:
estimate_thres_auc = []
estimate_thres_gain = []
for i in range(5):
    len_batch1 = len(batch1) 
    base_df = batch1.append(batch2).reset_index(drop = False).sample(frac=1)
    fea_base, auc_base = adversial_validation(base_df[:len_batch1], base_df[len_batch1:])
    estimate_thres_auc.append(auc_base)
    estimate_thres_gain.append(fea_base['gain'].values[0])

----Adversial Score is 0.50321503------
   feature  split       gain  gain_percent
10   col_9      6  58.554870     17.952197
0    index      5  52.832130     16.197676
12  col_11      4  38.147720     11.695618
14  col_13      3  34.030280     10.433262
16  col_15      3  30.914001      9.477849
5    col_4      2  18.628449      5.711252
4    col_3      2  18.068140      5.539468
15  col_14      2  17.223480      5.280506
11  col_10      2  14.206790      4.355626
18  col_17      2  14.049010      4.307252
----Adversial Score is 0.50557368------
   feature  split      gain  gain_percent
10   col_9      2  24.01961     24.727705
14  col_13      2  14.97471     15.416163
11  col_10      1  12.19990     12.559551
0    index      1  11.64480     11.988087
13  col_12      1  11.42710     11.763970
19  col_18      1   8.28197      8.526121
16  col_15      1   7.31287      7.528453
4    col_3      1   7.27547      7.489950
18  col_17      0   0.00000      0.000000
17  col_16      0   0.00000

In [12]:
estimate_thres_gain

[55.708431243896484,
 117.59082984924316,
 27.317980766296387,
 20.957239627838135,
 74.9548110961914]

In [10]:
#auc threashold
np.mean(estimate_thres_auc)

0.504013757

In [11]:
# drift threashold
np.mean(estimate_thres_gain)

59.305858516693114